diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml index 3b9724c7..41ec7fc3 100644 --- a/.github/actions/nf-test/action.yml +++ b/.github/actions/nf-test/action.yml @@ -68,7 +68,7 @@ runs: --changed-since HEAD^ \ --verbose \ --tap=test.tap \ - --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} --debug # Save the absolute path of the test.tap file to the output echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml index c98d76ec..1668cf73 100644 --- a/.github/workflows/nf-test.yml +++ b/.github/workflows/nf-test.yml @@ -50,7 +50,7 @@ jobs: env: NFT_VER: ${{ env.NFT_VER }} with: - max_shards: 7 + max_shards: 10 - name: debug run: | diff --git a/.gitignore b/.gitignore index a42ce016..b91d76b2 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,14 @@ testing/ testing* *.pyc null/ +.nf-test* +.idea/ +.vscode/ +taggers/ +tokenizers/ +corpora/ +.github/act.custom_runner.Dockerfile +.ruff_cache +galaxy/test_output/ +TODO +test/ diff --git a/.nf-core.yml b/.nf-core.yml index ce1370fb..9d83d644 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -6,16 +6,22 @@ lint: - conf/igenomes_ignored.config files_unchanged: - assets/nf-core-stableexpression_logo_light.png + - docs/images/nf-core-stableexpression_logo_light.png + - docs/images/nf-core-stableexpression_logo_dark.png - .github/PULL_REQUEST_TEMPLATE.md nextflow_config: - params.input + template_strings: + - tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet + - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet + - tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet schema_lint: false -nf_core_version: 3.5.1 + +nf_core_version: 3.5.2 repository_type: pipeline template: author: Olivier Coen - description: This pipeline is dedicated to finding the most stable genes across - count datasets + description: This pipeline is 
dedicated to identifying the most stable genes within a single or multiple expression dataset(s). This is particularly useful for identifying the most suitable RT-qPCR reference genes for a specific species. force: false is_nfcore: true name: stableexpression @@ -24,4 +30,4 @@ template: skip_features: - igenomes - fastqc - version: 1.0dev + version: 1.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d06777a8..c7942f15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,10 +1,12 @@ repos: - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.1.0" + rev: "v4.0.0-alpha.8" hooks: - id: prettier additional_dependencies: - prettier@3.6.2 + exclude: galaxy/ + - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: @@ -25,3 +27,15 @@ repos: subworkflows/nf-core/.*| .*\.snap$ )$ + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.1 + hooks: + # Run the linter. + - id: ruff + files: \.py$ + args: [--fix] + exclude: bin/old/ + # Run the formatter. + - id: ruff-format + files: \.py$ diff --git a/.prettierignore b/.prettierignore index dd749d43..7cc55006 100644 --- a/.prettierignore +++ b/.prettierignore @@ -14,3 +14,6 @@ bin/ ro-crate-metadata.json modules/nf-core/ subworkflows/nf-core/ +galaxy/ +docs/ +tests/act diff --git a/CHANGELOG.md b/CHANGELOG.md index d3ef518e..798c10f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## v1.0.0 - 18/03/2026 -Initial release of nf-core/stableexpression, created with the [nf-core](https://nf-co.re/) template. +First complete, official release of nf-core/stableexpression. + +## v1.0dev - 26/01/2025 + +Initial pre-release of nf-core/stableexpression, created with the [nf-core](https://nf-co.re/) template. 
### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index 741ce1ed..e7423ab9 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,26 @@ ## Pipeline tools +- [EBI Expression Atlas](https://www.ebi.ac.uk/gxa/home) + +> Papatheodorou I, Fonseca NA, Keays M, Tang YA, Barrera E, Bazant W, Burke M, Füllgrabe A, Muñoz-Pomer Fuentes A, George N, Huerta L, Koskinen S, Mohammed S, Geniza M, Preece J, Jaiswal P, Jarnuczak AF, Huber W, Stegle O, Vizcaino JA, Brazma A, Petryszak R. Expression Atlas: gene and protein expression across multiple studies and organisms. Nucleic Acids Res. 2017 Nov 20;46(Database issue):D246–D251. doi: 10.1093/nar/gkx1158. PubMed PMID: 29165655. + +- [NCBI GEO](https://www.ncbi.nlm.nih.gov/geo/) + +> Ron Edgar, Michael Domrachev & Alex E Lash. Gene Expression Omnibus: NCBI gene expression and hybridization array data repository. Nucleic Acids Res. 2002 Jan 1;30(1):207-10. doi: 10.1093/nar/30.1.207. PubMed PMID: 11752295. + +- [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) + +> Reimand J, Kull M, Peterson H, Hansen J, Vilo J. g:Profiler—a web-based toolset for functional profiling of gene lists from large-scale experiments. Nucleic Acids Res. 2007 May 3;35(Web Server issue):W193–W200. doi:10.1093/nar/gkm226. PubMed PMID: 17478515. + +- [Normfinder](https://rdrr.io/github/dhammarstrom/generefer/man/normfinder.html) + +> Claus Lindbjerg Andersen, Jens Ledet Jensen, Torben Falck Ørntoft. Normalization of Real-Time Quantitative Reverse Transcription-PCR Data: A Model-Based Variance Estimation Approach to Identify Genes Suited for Normalization, Applied to Bladder and Colon Cancer Data Sets. Cancer Res (2004) 64 (15): 5245–5250. doi:10.1158/0008-5472.CAN-04-0496. PubMed PMID: 15289330. + +- [GeNorm](https://pypi.org/project/rna-genorm/) + +> Jo Vandesompele, Katleen De Preter, Filip Pattyn, Bruce Poppe, Nadine Van Roy, Anne De Paepe, Frank Speleman. 
Accurate normalization of real-time quantitative RT-PCR data by geometric averaging of multiple internal control genes. Genome Biol. 2002 Jun 18;3(7):RESEARCH0034. doi: 10.1186/gb-2002-3-7-research0034 Pubmed PMID: 12184808. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/README.md b/README.md index e6a52620..bea23099 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ [![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) [![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1) +[![run with apptainer](https://custom-icon-badges.demolab.com/badge/run%20with-apptainer-4545?logo=apptainer&color=teal&labelColor=000000)](https://apptainer.org/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) @@ -21,54 +22,122 @@ ## Introduction -**nf-core/stableexpression** is a bioinformatics pipeline that ... +**nf-core/stableexpression** is a bioinformatics pipeline aiming to aggregate multiple count datasets for a specific species and find the most stable genes. 
The datasets can be either downloaded from public databases (EBI, NCBI) or provided directly by the user. Both RNA-seq and Microarray count datasets can be utilised. - +

+ +

- -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +It takes as main inputs : -## Usage +- a species name (mandatory) +- keywords for Expression Atlas / GEO search (optional) +- a CSV input file listing your own raw / normalised count datasets (optional). + +**Use cases**: + +- **find the most suitable genes as RT-qPCR reference genes for a specific species (and optionally specific conditions)** +- download all Expression Atlas and / or NCBI GEO datasets for a species (and optionally keywords) + +## Pipeline overview + +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: + +#### 1. Get accessions from public databases + +- Get [Expression Atlas](https://www.ebi.ac.uk/gxa/home) dataset accessions corresponding to the provided species (and optionally keywords) + This step is run by default but is optional. Set `--skip_fetch_eatlas_accessions` to skip it. +- Get NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) **microarray** dataset accessions corresponding to the provided species (and optionally keywords) + This is optional and **NOT** run by default. Set `--fetch_geo_accessions` to run it. + +#### 2. Download data (see [usage](./conf/usage.md#3-provide-your-own-accessions)) + +- Download [Expression Atlas](https://www.ebi.ac.uk/gxa/home) data if any +- Download NCBI [GEO](https://www.ncbi.nlm.nih.gov/gds) data if any > [!NOTE] -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +> At this point, datasets downloaded from public databases are merged with datasets provided by the user using the `--datasets` parameter. See [usage](./conf/usage.md#4-use-your-own-expression-datasets) for more information about local datasets. - +#### 10. 
Reporting -Now, you can run the pipeline using: +- Result aggregation +- Make [`MultiQC`](http://multiqc.info/) report +- Prepare [Dash Plotly](https://dash.plotly.com/) app for further investigation of gene / sample counts - +## Test pipeline + +You can test the execution of the pipeline locally with: + +```bash +nextflow run nf-core/stableexpression -profile test, +``` + +## Basic usage + +> [!NOTE] +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. + +To search the most stable genes in a species considering all public datasets, simply run: ```bash nextflow run nf-core/stableexpression \ - -profile \ - --input samplesheet.csv \ - --outdir + -profile \ + --species \ + --outdir \ + -resume ``` -> [!WARNING] -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). +## More advanced usage + +For more specific scenarios, like: -For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/stableexpression/usage) and the [parameter documentation](https://nf-co.re/stableexpression/parameters). +- **fetching only specific conditions** +- **using your own expression dataset(s)** + +please refer to the [usage documentation](https://nf-co.re/stableexpression/usage). + +## Resource allocation + +For setting pipeline CPU / memory usage, see [here](./docs/configuration.md). + +## Profiles + +See [here](https://nf-co.re/stableexpression/usage#profiles) for more information about profiles. 
## Pipeline output @@ -76,13 +145,21 @@ To see the results of an example test run with a full size dataset refer to the For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/stableexpression/output). +## Support us + +If you like nf-core/stableexpression, please make sure you give it a star on GitHub! + +[![stars - stableexpression](https://img.shields.io/github/stars/nf-core/stableexpression?style=social)](https://github.com/nf-core/stableexpression) + ## Credits nf-core/stableexpression was originally written by Olivier Coen. -We thank the following people for their extensive assistance in the development of this pipeline: +We thank the following people for their assistance in the development of this pipeline: - +- Rémy Costa +- Shaheen Acheche +- Janine Soares ## Contributions and Support @@ -95,8 +172,6 @@ For further information or help, don't hesitate to get in touch on the [Slack `# - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index e116bbee..d4670c25 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,8 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/stableexpression Methods Description" section_href: "https://github.com/nf-core/stableexpression" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline -## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

Data was processed using nf-core/stableexpression v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

diff --git a/assets/multiqc_config.custom_content.template.yaml b/assets/multiqc_config.custom_content.template.yaml new file mode 100644 index 00000000..fae01dd9 --- /dev/null +++ b/assets/multiqc_config.custom_content.template.yaml @@ -0,0 +1,690 @@ +custom_data: + ranked_most_stable_genes_summary_template: + section_name: "Most stable genes" + file_format: "csv" + no_violin: true + description: "Expression descriptive statistics of all genes, ranked by stability. Genes are sorted by stability score - from the most stable to the least stable." + plot_type: "table" + pconfig: + col1_header: "Gene ID" + sort_rows: false + headers: + gene_id: + title: "Gene ID" + rank: + title: "Rank" + description: Rank of the gene based on stability score + scale: "RdYlGn-rev" + cond_formatting_rules: + between_fourth_and_tenth: + - eq: 4 + - eq: 5 + - eq: 6 + - eq: 7 + - eq: 8 + - eq: 9 + - eq: 10 + third: + - eq: 3 + second: + - eq: 2 + first: + - eq: 1 + name: + title: "Gene name" + description: + title: "Gene description" + original_gene_ids: + title: "Original gene IDs" + description: Original gene IDs as stated in the input (provided or downloaded) datasets + stability_score: + title: "Stability score" + description: "Final stability score : the lower, the better" + format: "{:,.6f}" + scale: "RdYlGn-rev" + normfinder_stability_value_normalised: + title: "Normalised Normfinder score" + description: Quantile normalised (among candidate genes) stability value as computed by Normfinder + format: "{:,.6f}" + scale: "PRGn-rev" + genorm_m_measure_normalised: + title: "Normalised Genorm score" + description: Quantile normalised (among candidate genes) M-measure as computed by Genorm + format: "{:,.6f}" + scale: "PRGn-rev" + ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples)" + description: Ratio of samples in which the gene is not represented, excluding samples with particularly low overall gene count. 
+ coefficient_of_variation_normalised: + title: "Normalised CV" + description: Quantile normalised (among candidate genes) coefficient of variation ( std(expression) / mean(expression) ) across all samples. + format: "{:,.6f}" + scale: "PRGn-rev" + robust_coefficient_of_variation_median_normalised: + title: "Normalised RCVm" + description: Quantile normalised (among candidate genes) robust coefficient of variation on median of the expression across all samples. + format: "{:,.4f}" + scale: "PRGn-rev" + coefficient_of_variation: + title: "CV" + description: Coefficient of variation ( std(expression) / mean(expression) ) across all samples. + format: "{:,.6f}" + robust_coefficient_of_variation_median: + title: "RCVm" + description: Robust coefficient of variation on median of the expression across all samples. + format: "{:,.4f}" + normfinder_stability_value: + title: "Normfinder stability value " + description: Stability value as computed by Normfinder + format: "{:,.6f}" + genorm_m_measure: + title: "Genorm M-measure" + description: M-measure as computed by Genorm + format: "{:,.6f}" + mean: + title: "Average" + description: Average expression across all samples. + format: "{:,.4f}" + standard_deviation: + title: "Standard deviation" + description: Standard deviation of the expression across all samples. + format: "{:,.6f}" + median: + title: "Median" + description: Median expression across all samples. + format: "{:,.4f}" + median_absolute_deviation: + title: "MAD" + description: Median absolute deviation of the expression across all samples. + format: "{:,.4f}" + expression_level_status: + title: "Expression level" + description: "Indication about the average gene expression level across all samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. Expression in [0.9, 0.95]: High expression. Expression in [0.95, 1]: Very high expression." 
+ cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + rnaseq_coefficient_of_variation: + title: "Var coeff [RNA-seq only]" + description: Coefficient of variation ( std(expression) / mean(expression) ) across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_robust_coefficient_of_variation_median: + title: "RCVm [RNA-seq only]" + description: Robust coefficient of variation on median of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_mean: + title: "Average [RNA-seq only]" + description: Average expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_standard_deviation: + title: "Std [RNA-seq only]" + description: Standard deviation of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_median: + title: "Median [RNA-seq only]" + description: Median expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_median_absolute_deviation: + title: "MAD [RNA-seq only]" + description: Median absolute deviation of the expression across RNA-Seq samples. + format: "{:,.4f}" + rnaseq_expression_level_status: + title: "Expression level [RNA-seq only]" + description: "Indication about the average gene expression level across RNA-seq samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. Expression in [0.9, 0.95]: High expression. Expression in [0.95, 1]: Very high expression." 
+ cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + microarray_coefficient_of_variation: + title: "Var coeff [Microarray only]" + description: Coefficient of variation ( std(expression) / mean(expression) ) across Microarray samples. + format: "{:,.4f}" + microarray_robust_coefficient_of_variation_median: + title: "RCVm [Microarray only]" + description: Robust coefficient of variation on median of the expression across Microarray samples. + format: "{:,.4f}" + microarray_mean: + title: "Average [Microarray only]" + description: Average expression across Microarray samples. + format: "{:,.4f}" + microarray_standard_deviation: + title: "Std [Microarray only]" + description: Standard deviation of the expression across Microarray samples. + format: "{:,.4f}" + microarray_median: + title: "Median [Microarray only]" + description: Median expression across Microarray samples. + format: "{:,.4f}" + microarray_median_absolute_deviation: + title: "MAD [Microarray only]" + description: Median absolute deviation of the expression across Microarray samples. + format: "{:,.4f}" + microarray_expression_level_status: + title: "Expression level [Microarray only]" + description: "Indication about the average gene expression level across Microarray samples (compared to the whole pool of genes). Expression in [0, 0.05]: Very low expression. Expression in [0.05, 0.1]: Low expression. Expression in [0.1, 0.9]: Medium range. Expression in [0.9, 0.95]: High expression. Expression in [0.95, 1]: Very high expression." 
+ cond_formatting_rules: + very_high: + - s_eq: "Very high expression" + high: + - s_eq: "High expression" + medium: + - s_eq: "Medium range" + low: + - s_eq: "Low expression" + very_low: + - s_eq: "Very low expression" + ratio_nulls_in_all_samples: + title: "Ratio null values (all samples)" + description: Ratio of samples in which the gene is not represented. + ratio_zeros: + title: "Ratio zero values" + description: Ratio of samples in which the gene has a zero value. + rnaseq_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [RNA-seq only]" + description: Ratio of RNA-Seq samples in which the gene is not represented. + rnaseq_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [RNA-seq only]" + description: Ratio of RNA-Seq samples in which the gene is not represented, excluding samples with particularly low overall gene count. + rnaseq_ratio_zeros: + title: "Ratio zero values [RNA-seq only]" + description: Ratio of RNA-Seq samples in which the gene has a zero value. + microarray_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [Microarray only]" + description: Ratio of Microarray samples in which the gene is not represented. + microarray_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [Microarray only]" + description: Ratio of Microarray samples in which the gene is not represented, excluding samples with particularly low overall gene count. + microarray_ratio_zeros: + title: "Ratio zero values [Microarray only]" + description: Ratio of Microarray samples in which the gene has a zero value. + + expr_distrib_most_stable_genes_template: + section_name: "Normalised count distributions" + file_format: "csv" + pconfig: + sort_samples: false + xlab: Expression + ylab: Gene + description: "Distribution of normalised gene expression (between 0 and 1) across samples for the most stable genes. 
Only the NB_GENES most stable genes are shown and genes are ranked from the most stable to the least stable." + plot_type: "boxplot" + + gene_statistics: + section_name: "Descriptive statistics - All genes" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + description: Distribution of descriptive statistics for all genes. + plot_type: "violin" + headers: + stability_score: + title: "Stability score" + color: "rgb(186,43,32)" + coefficient_of_variation_normalised: + title: "Normalised CV" + color: "rgb(64, 122, 22)" + robust_coefficient_of_variation_median_normalised: + title: "Normalised RCVm" + color: "rgb(64, 122, 22)" + normfinder_stability_value_normalised: + title: "Normalised Normfinder score" + color: "rgb(64, 122, 22)" + genorm_m_measure_normalised: + title: "Normalised Genorm score" + color: "rgb(64, 122, 22)" + coefficient_of_variation: + title: "CV" + color: "rgb(26, 167, 178)" + robust_coefficient_of_variation_median: + title: "RCVm" + color: "rgb(26, 167, 178)" + normfinder_stability_value: + title: "Normfinder stability value " + color: "rgb(26, 167, 178)" + genorm_m_measure: + title: "Genorm M-measure" + color: "rgb(26, 167, 178)" + mean: + title: "Average" + color: "rgb(26, 167, 178)" + standard_deviation: + title: "Standard deviation" + color: "rgb(26, 167, 178)" + median: + title: "Median" + color: "rgb(26, 167, 178)" + median_absolute_deviation: + title: "MAD" + color: "rgb(26, 167, 178)" + rnaseq_coefficient_of_variation: + title: "Var coeff [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_robust_coefficient_of_variation_median: + title: "RCVm [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_mean: + title: "Average [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_standard_deviation: + title: "Std [RNA-seq only]" + color: "rgb(140, 50, 76)" + rnaseq_median: + title: "Median [RNA-seq only]" + color: "rgb(140, 50, 76)" + 
rnaseq_median_absolute_deviation: + title: "MAD [RNA-seq only]" + color: "rgb(140, 50, 76)" + microarray_coefficient_of_variation: + title: "Var coeff [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_robust_coefficient_of_variation_median: + title: "RCVm [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_mean: + title: "Average [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_standard_deviation: + title: "Std [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_median: + title: "Median [Microarray only]" + color: "rgb(27, 83, 73)" + microarray_median_absolute_deviation: + title: "MAD [Microarray only]" + color: "rgb(27, 83, 73)" + ratio_nulls_in_all_samples: + title: "Ratio null values (all samples)" + color: "rgb(106, 78, 193)" + ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples)" + color: "rgb(106, 78, 193)" + ratio_zeros: + title: "Ratio zero values" + color: "rgb(106, 78, 193)" + rnaseq_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [RNA-seq only]" + color: "rgb(106, 78, 193)" + rnaseq_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [RNA-seq only]" + color: "rgb(106, 78, 193)" + rnaseq_ratio_zeros: + title: "Ratio zero values [RNA-seq only]" + color: "rgb(106, 78, 193)" + microarray_ratio_nulls_in_all_samples: + title: "Ratio null values (all samples) [Microarray only]" + color: "rgb(106, 78, 193)" + microarray_ratio_nulls_in_valid_samples: + title: "Ratio null values (valid samples) [Microarray only]" + color: "rgb(106, 78, 193)" + microarray_ratio_zeros: + title: "Ratio zero values [Microarray only]" + color: "rgb(106, 78, 193)" + + skewness: + section_name: "Count skewness" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Skewness + ylab: Dataset + description: Distribution of count skewness across 
samples, displayed dataset per dataset. + plot_type: "boxplot" + + ratio_zeros: + section_name: "Proportion of zeros" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Proportion of zeros + ylab: Dataset + description: Distribution of zeros across samples, displayed dataset per dataset. + plot_type: "boxplot" + + ratio_nulls: + section_name: "Proportion of missing values" + parent_id: stats + parent_name: "Statistics" + parent_description: "Various statistics at the gene or sample level" + file_format: "csv" + pconfig: + sort_samples: false + #xmin: 0 + #xmax: 1 + xlab: Proportion of null values + ylab: Dataset + description: Distribution of missing values (including genes not present) across samples, displayed dataset per dataset. + plot_type: "boxplot" + + null_values_filter: + section_name: "Filter on null values" + parent_id: filtering + parent_name: "Sample filtering" + parent_description: "Proportion of samples filtered out, relative to the variable observed" + file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: false # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" + description: Effect of filter on ratio of null (missing) values + categories: + kept: + name: "Nb samples kept" + color: "#2ABF96" + rejected: + name: "Nb samples rejected" + color: "#38B4F2" + plot_type: "barplot" + + zero_values_filter: + section_name: "Filter on zero values" + parent_id: filtering + parent_name: "Sample filtering" + parent_description: "Proportion of samples filtered out, relative to the variable observed" + file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: false # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" + description: 
Effect of filter on ratio of zero values + categories: + kept: + name: "Nb samples kept" + color: "#2ABF96" + rejected: + name: "Nb samples rejected" + color: "#38B4F2" + plot_type: "barplot" + + id_mapping_stats: + section_name: "Gene ID mapping statistics" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "csv" + pconfig: + sort_samples: true + tt_decimals: 0 + cpswitch: true # show the 'Counts / Percentages' switch + cpswitch_c_active: true # show counts per default + stacking: "relative" + description: Statistics of gene ID mapping, dataset per dataset + categories: + final: + name: "Nb final gene IDs" + color: "#2ABF96" + merged: + name: "Nb gene IDs merged with other IDs" + color: "#38B4F2" + not_valid: + name: "Nb rare gene IDs removed" + color: "#F2C038" + unmapped: + name: "Nb unmapped gene IDs" + color: "#E3224A" + plot_type: "barplot" + + total_gene_id_occurrence_quantiles: + section_name: "Distribution of gene ID occurrence quantiles" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "csv" + pconfig: + categories: true + #ymax: 1.1 + #ymin: -0.1 + #y_lines: + # - value: 1 + # color: "#ff0000" + # width: 2 + # dash: "dash" + # label: "Threshold" + description: Quantiles of the total number of occurrences of gene IDs across all datasets. Quantile values were sorted from greatest to least. + plot_type: "linegraph" + helptext: Gene IDs can be present or absent in the datasets. For each gene ID, the total number of occurrences across all datasets was calculated and quantile values were computed from these totals. 
+ + eatlas_selected_experiments_metadata: + section_name: "Selected" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of selected Expression Atlas datasets corresponding to the provided species (and optionally the provided keywords) + plot_type: "table" + + eatlas_all_experiments_metadata: + section_name: "All datasets" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of all Expression Atlas datasets corresponding to the provided species + plot_type: "table" + + eatlas_failure_reasons: + section_name: "Failure reasons" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Reasons of failure during download of Expression Atlas datasets + plot_type: "table" + + eatlas_warning_reasons: + section_name: "Warnings" + parent_id: eatlas + parent_name: "Expression Atlas datasets" + parent_description: "Information about the Expression Atlas datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Warnings during download of Expression Atlas datasets + plot_type: "table" + + geo_selected_experiments_metadata: + section_name: "Selected" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of selected GEO datasets corresponding to the provided species (and optionally the provided keywords) + plot_type: "table" + + geo_all_experiments_metadata: + section_name: "All datasets" + parent_id: geo + 
parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of all GEO datasets corresponding to the provided species + plot_type: "table" + + geo_rejected_experiments_metadata: + section_name: "Rejected GEO datasets" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "tsv" + no_violin: true + description: Metadata of all GEO datasets which were rejected + plot_type: "table" + + geo_failure_reasons: + section_name: "Failure reasons" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Reasons of failure during download of GEO datasets + plot_type: "table" + + geo_warning_reasons: + section_name: "Warnings" + parent_id: geo + parent_name: "GEO datasets" + parent_description: "Information about the GEO datasets processed in the analysis" + file_format: "csv" + no_violin: true + description: Warnings during download of GEO datasets + plot_type: "table" + + id_cleaning_failure_reasons: + section_name: "Gene ID cleaning failure reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: Reasons of failure during gene ID cleaning + plot_type: "table" + + renaming_warning_reasons: + section_name: "Gene renaming warning reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: Reasons of warning during gene ID renaming. 
You can further investigate ID mapping issues on the g:Profiler website at https://biit.cs.ut.ee/gprofiler/convert + plot_type: "table" + + renaming_failure_reasons: + section_name: "Gene renaming failure reasons" + parent_id: idmapping + parent_name: "ID mapping" + parent_description: "Information about the ID mapping" + file_format: "tsv" + no_violin: true + description: Reasons of failure during gene ID renaming + plot_type: "table" + + normalisation_failure_reasons: + section_name: "Failure reasons" + parent_id: normalisation + parent_name: "Normalisation" + parent_description: "Information about the normalisation" + file_format: "tsv" + no_violin: true + description: Reasons of failure during Normalisation (DESeq2 or edgeR) + plot_type: "table" + + normalisation_warning_reasons: + section_name: "Warning reasons" + parent_id: normalisation + parent_name: "Normalisation" + parent_description: "Information about the normalisation" + file_format: "tsv" + no_violin: true + description: Reasons of warning during Normalisation (DESeq2 or edgeR) + plot_type: "table" + +custom_content: + order: + - gene_statistics + - skewness + - ratio_zeros + - ratio_nulls + - null_values_filter + - zero_values_filter + - id_mapping_stats + - total_gene_id_occurrence_quantiles + - eatlas_selected_experiments_metadata + - eatlas_all_experiments_metadata + - eatlas_failure_reasons + - eatlas_warning_reasons + - geo_selected_experiments_metadata + - geo_all_experiments_metadata + - geo_rejected_experiments_metadata + - geo_failure_reasons + - geo_warning_reasons + - id_cleaning_failure_reasons + - renaming_warning_reasons + - renaming_failure_reasons + - normalisation_failure_reasons + - normalisation_warning_reasons + +sp: + ranked_most_stable_genes_summary_template: + fn: "*SECTION.most_stable_genes_summary.csv" + max_filesize: 5000000 # 5MB + expr_distrib_most_stable_genes_template: + fn: "*SECTION.most_stable_genes_transposed_counts.csv" + max_filesize: 50000000 # 50MB + 
gene_statistics: + fn: "*all_genes_summary.csv" + max_filesize: 50000000 # 50MB + id_mapping_stats: + fn: "*id_mapping_stats.csv" + null_values_filter: + fn: "*missing_values_filter_stats.csv" + zero_values_filter: + fn: "*zero_values_filter_stats.csv" + total_gene_id_occurrence_quantiles: + fn: "*total_gene_id_occurrence_quantiles.csv" + skewness: + fn: "*skewness.transposed.csv" + ratio_zeros: + fn: "*ratio_zeros.transposed.csv" + ratio_nulls: + fn: "*ratio_nulls.transposed.csv" + eatlas_selected_experiments_metadata: + fn: "*selected_experiments.metadata.tsv" + eatlas_all_experiments_metadata: + fn: "*species_experiments.metadata.tsv" + eatlas_failure_reasons: + fn: "*eatlas_failure_reasons.csv" + eatlas_warning_reasons: + fn: "*eatlas_warning_reasons.csv" + geo_selected_experiments_metadata: + fn: "*geo_selected_datasets.metadata.tsv" + geo_all_experiments_metadata: + fn: "*geo_all_datasets.metadata.tsv" + geo_rejected_experiments_metadata: + fn: "*geo_rejected_datasets.metadata.tsv" + geo_failure_reasons: + fn: "*geo_failure_reasons.csv" + geo_warning_reasons: + fn: "*geo_warning_reasons.csv" + id_cleaning_failure_reasons: + fn: "*id_cleaning_failure_reasons.tsv" + renaming_warning_reasons: + fn: "*renaming_warning_reasons.tsv" + renaming_failure_reasons: + fn: "*renaming_failure_reasons.tsv" + normalisation_failure_reasons: + fn: "*normalisation_failure_reasons.csv" + normalisation_warning_reasons: + fn: "*normalisation_warning_reasons.csv" diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 7addef1b..565c88e2 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,6 @@ report_comment: > - This report has been generated by the nf-core/stableexpression - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/stableexpression analysis pipeline. 
For information about how to interpret these results, please see the documentation. + report_section_order: "nf-core-stableexpression-methods-description": order: -1000 @@ -12,4 +11,24 @@ report_section_order: export_plots: true +run_modules: + - custom_content + disable_version_detection: true + +max_table_rows: 100000 + +table_cond_formatting_colours: + - first: "#ffd700" + - second: "#C0C0C0" + - third: "#CD7F32" + - between_fourth_and_tenth: "#468F8F" + - very_low: "#337ab7" + - low: "#5bc0de" + - medium: "#5cb85c" + - high: "#f0ad4e" + - very_high: "#d9534f" + +#violin_downsample_after: 10000 + +log_filesize_limit: 10000000000 # 10GB diff --git a/assets/nf-core-stableexpression_logo_light.png b/assets/nf-core-stableexpression_logo_light.png index af2eddfd..d1af4a9c 100644 Binary files a/assets/nf-core-stableexpression_logo_light.png and b/assets/nf-core-stableexpression_logo_light.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab7..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_datasets.json b/assets/schema_datasets.json new file mode 100644 index 00000000..fa320e10 --- /dev/null +++ b/assets/schema_datasets.json @@ -0,0 +1,41 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_datasets.json", + "title": "nf-core/stableexpression pipeline - params.schema_datasets schema", + "description": "Schema for the file provided with params.datasets", + "type": "array", + "items": { + "type": "object", + "properties": { + "counts": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(csv|tsv|dat)$", + 
"errorMessage": "You must provide a count dataset file" + }, + "design": { + "type": "string", + "format": "file-path", + "schema": "assets/schema_design.json", + "exists": true, + "pattern": "^\\S+\\.(csv|tsv|dat)$", + "errorMessage": "You must provide a design file", + "meta": ["design"] + }, + "platform": { + "type": "string", + "errorMessage": "You must specify the platform of the dataset", + "enum": ["rnaseq", "microarray"], + "meta": ["platform"] + }, + "normalised": { + "type": "boolean", + "description": "Specify whether the dataset is already normalised", + "errorMessage": "You must specify whether the dataset is already normalised (true or false)", + "meta": ["normalised"] + } + }, + "required": ["counts", "design", "platform", "normalised"] + } +} diff --git a/assets/schema_design.json b/assets/schema_design.json new file mode 100644 index 00000000..dc1e4b87 --- /dev/null +++ b/assets/schema_design.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_design.json", + "title": "nf-core/stableexpression pipeline - design schema", + "description": "Schema for the design file provided in the design column of the params.datasets CSV / TSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample name must be provided and cannot contain spaces" + }, + "condition": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Condition name must be provided and cannot contain spaces" + } + }, + "required": ["sample", "condition"] + } +} diff --git a/assets/schema_gene_id_mapping.json b/assets/schema_gene_id_mapping.json new file mode 100644 index 00000000..fc537199 --- /dev/null +++ b/assets/schema_gene_id_mapping.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": 
"https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_id_mapping.json", + "title": "nf-core/stableexpression pipeline - custom mappings schema", + "description": "Schema for the file provided with the params.gene_id_mapping CSV / TSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "original_gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for original gene IDs." + }, + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for mapped IDs." + } + }, + "required": ["original_gene_id", "gene_id"] + } +} diff --git a/assets/schema_gene_length.json b/assets/schema_gene_length.json new file mode 100644 index 00000000..b395cea0 --- /dev/null +++ b/assets/schema_gene_length.json @@ -0,0 +1,23 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_length.json", + "title": "nf-core/stableexpression pipeline - custom mappings schema", + "description": "Schema for the file provided with in the design column of the params.gene_length CSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for original gene IDs." + }, + "length": { + "type": "integer", + "minimum": 0, + "errorMessage": "You must provide a column for gene lengths." 
+ } + }, + "required": ["gene_id", "length"] + } +} diff --git a/assets/schema_gene_metadata.json b/assets/schema_gene_metadata.json new file mode 100644 index 00000000..d3faad8c --- /dev/null +++ b/assets/schema_gene_metadata.json @@ -0,0 +1,28 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/master/assets/schema_gene_metadata.json", + "title": "nf-core/stableexpression pipeline - custom mappings schema", + "description": "Schema for the file provided with the params.gene_metadata CSV / TSV file", + "type": "array", + "items": { + "type": "object", + "properties": { + "gene_id": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "You must provide a column for mapped IDs." + }, + "name": { + "type": "string", + "pattern": "^[^,]+$", + "errorMessage": "You must provide a column for gene names." + }, + "description": { + "type": "string", + "pattern": "^[^,]+$", + "errorMessage": "You must provide a column for gene descriptions." 
+ } + }, + "required": ["gene_id", "name", "description"] + } +} diff --git a/assets/schema_input.json b/assets/schema_input.json deleted file mode 100644 index 8618f24a..00000000 --- a/assets/schema_input.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/main/assets/schema_input.json", - "title": "nf-core/stableexpression pipeline - params.input schema", - "description": "Schema for the file provided with params.input", - "type": "array", - "items": { - "type": "object", - "properties": { - "sample": { - "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", - "meta": ["id"] - }, - "fastq_1": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" - }, - "fastq_2": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" - } - }, - "required": ["sample", "fastq_1"] - } -} diff --git a/bin/aggregate_results.py b/bin/aggregate_results.py new file mode 100755 index 00000000..fd536674 --- /dev/null +++ b/bin/aggregate_results.py @@ -0,0 +1,509 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +import yaml +from common import write_float_csv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +ALL_GENE_SUMMARY_OUTFILENAME = "all_genes_summary.csv" +SUMMARY_OUTFILENAME_SUFFIX = "most_stable_genes_summary.csv" +COUNTS_OUTFILENAME_SUFFIX = "most_stable_genes_transposed_counts.csv" +CUSTOM_CONTENT_MULTIQC_CONFIG_FILE = "custom_content_multiqc_config.yaml" + +# quantile intervals +NB_EXPRESSION_QUANTILES = 100 +NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS = 25 + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get statistics from count data for each gene" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--target-genes", + type=str, + nargs="+", + dest="target_genes", + default=[], + help="File containing target genes", + ) + parser.add_argument( + "--stats-with-scores", + type=Path, + nargs="+", + dest="stat_score_files", + required=True, + help="Files containing statistics for all genes and stability scores by candidate genes, one per section", + ) + parser.add_argument( + "--multiqc-config", + type=Path, + dest="multiqc_config", + required=True, + help="MultiQC config file for custom content", + ) + parser.add_argument( + "--platform-stats", + type=Path, + dest="platform_stat_files", + nargs="+", + help="File containing base statistics for all genes and for all datasets for a specific platform", + ) + parser.add_argument( + "--metadata", + type=str, + dest="metadata_files", + help="Metadata file", + ) + parser.add_argument( + "--mappings", type=str, dest="mapping_files", help="Mapping 
file" + ) + return parser.parse_args() + + +def parse_stat_score_file(file: Path) -> pl.DataFrame: + return pl.read_csv(file).with_columns( + pl.col(config.GENE_ID_COLNAME).cast(pl.String()) + ) + + +def get_non_empty_dataframes(files: list[Path]) -> list[pl.DataFrame]: + dfs = [pl.read_csv(file) for file in files] + return [df for df in dfs if not df.is_empty()] + + +def cast_cols_to_string(df: pl.DataFrame) -> pl.DataFrame: + return df.select( + [pl.col(column).cast(pl.String) for column in df.collect_schema().names()] + ) + + +def concat_cast_to_string_and_drop_duplicates(files: list[Path]) -> pl.DataFrame: + """Concatenate DataFrames, cast all columns to String, and drop duplicates. + + The first step is to concatenate the DataFrames. Then, the dataframe is cast + to String to ensure that all columns have the same data type. Finally, duplicate + rows are dropped. + """ + dfs = get_non_empty_dataframes(files) + dfs = [cast_cols_to_string(df) for df in dfs] + concat_df = pl.concat(dfs) + # dropping duplicates + # casting all columns to String + return concat_df.unique() + + +def cast_count_columns_to_float(df: pl.DataFrame) -> pl.DataFrame: + return df.select( + pl.col(config.GENE_ID_COLNAME), + pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64), + ) + + +def join_data_on_gene_id(stat_df: pl.DataFrame, *dfs: pl.DataFrame) -> pl.DataFrame: + """Merge the statistics dataframe with the metadata dataframe and the mapping dataframe.""" + # we need to ensure that the index of stat_df are strings + for df in dfs: + stat_df = stat_df.join(df, on=config.GENE_ID_COLNAME, how="left") + return stat_df + + +def get_counts(file: Path) -> pl.DataFrame: + # sorting dataframe (necessary to get consistent output) + return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False) + + +def get_metadata(metadata_files: list[Path]) -> pl.DataFrame | None: + """Retrieve and concatenate metadata from a list of metadata files.""" + if not metadata_files: + return None + 
return concat_cast_to_string_and_drop_duplicates(metadata_files) + + +def get_mappings(mapping_files: list[Path]) -> pl.DataFrame | None: + if not mapping_files: + return None + concat_df = concat_cast_to_string_and_drop_duplicates(mapping_files) + # group by new gene IDs and gets the lis + # convert the list column to a string representation + # separate the original gene IDs with a semicolon + return concat_df.group_by(config.GENE_ID_COLNAME).agg( + pl.col(config.ORIGINAL_GENE_ID_COLNAME) + .unique() + .sort() + .str.join(";") + .alias(config.ORIGINAL_GENE_IDS_COLNAME) + ) + + +def get_status(quantile_interval: int) -> str: + """Return the expression level status of the gene given its quantile interval.""" + if NB_EXPRESSION_QUANTILES - 5 <= quantile_interval: + return "Very high expression" + elif ( + NB_EXPRESSION_QUANTILES - 10 <= quantile_interval < NB_EXPRESSION_QUANTILES - 5 + ): + return "High expression" + elif 4 < quantile_interval <= 9: + return "Low expression" + elif quantile_interval <= 4: + return "Very low expression" + else: + return "Medium range" + + +def add_expression_level_status(df: pl.DataFrame) -> pl.DataFrame: + logger.info("Adding expression level status") + mapping_dict = { + quantile_interval: get_status(quantile_interval) + for quantile_interval in range(NB_EXPRESSION_QUANTILES) + } + return df.with_columns( + pl.col(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME) + .replace_strict(mapping_dict) + .alias(config.EXPRESSION_LEVEL_STATUS_COLNAME) + ) + + +def complement_gene_summary_table( + stat_summary_df: pl.DataFrame, *dfs: pl.DataFrame +) -> pl.DataFrame: + """ + Add various metadata to statistics summary. 
+ """ + # add gene name, description and original gene IDs to statistics summary + stat_summary_df = join_data_on_gene_id(stat_summary_df, *dfs) + stat_summary_df = add_expression_level_status(stat_summary_df) + return stat_summary_df + + +def get_most_stable_genes_counts( + log_count_df: pl.DataFrame, stat_summary_df: pl.DataFrame +) -> pl.DataFrame: + # getting list of top stable genes with their order + top_genes_with_order = ( + stat_summary_df.head(NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS) + .select(config.GENE_ID_COLNAME) + .with_row_index("sort_order") + ) + + # join to get only existing genes and maintain order + sorted_transposed_counts_df = log_count_df.join( + top_genes_with_order, on=config.GENE_ID_COLNAME, how="inner" + ).sort("sort_order", descending=False) + + # get the actual gene names that were found (in order) + actual_gene_names = ( + sorted_transposed_counts_df.select(config.GENE_ID_COLNAME).to_series().to_list() + ) + return sorted_transposed_counts_df.drop( + ["sort_order", config.GENE_ID_COLNAME] + ).transpose(column_names=actual_gene_names) + + +def format_multiqc_section( + section: str, nb_sections: int, template_dict: dict, found_target_genes: list[dict] +): + section_dict = dict(template_dict) + + parent_id = section.replace("_", " ") + parent_name = ( + f"{section.replace('_', ' ').capitalize()} / {nb_sections}: most stable genes" + ) + parent_description = ( + f"Most stable genes and distribution of their normalised counts for {section.replace('_', ' ')} / {nb_sections}" + + " (section 1 corresponding to the most expressed genes)" + ) + + additional_name = "" + if found_target_genes: + additional_names = [ + f"{d['target_gene']} ({d['gene']})" for d in found_target_genes + ] + additional_name = ". 
Comprises " + ", ".join(additional_names) + + section_dict["parent_id"] = parent_id + section_dict["parent_name"] = parent_name + additional_name + section_dict["parent_description"] = parent_description + + return section_dict + + +def format_multiqc_sp(section: str, template_dict: dict): + sp_dict = dict(template_dict) + sp_dict["fn"] = sp_dict["fn"].replace("SECTION", section) + return sp_dict + + +def format_genes(genes: list[str]): + # str.maketrans("", "", "-_.") makes a mapping table for str.translate() that + # removes all occurrences of any character in "-_." from the input string + # it's faster than re.sub + return pl.Series( + [gene.lower().translate(str.maketrans("", "", "-_.")).strip() for gene in genes] + ) + + +def search_target_genes(df: pl.DataFrame, target_genes: list[str]) -> list[dict]: + """ + Search for target genes in a DataFrame. + + Args: + df (pl.DataFrame): The DataFrame to search in. + target_genes (list[str]): The list of target genes to search for. + + Returns: + list[dict]: A list of dictionaries associating each found target gene with its corresponding gene ID in the datasets. 
+ """ + + unique_gene_ids = set(df[config.GENE_ID_COLNAME].to_list()) + + if config.GENE_NAME_COLNAME in df.columns: + unique_gene_ids |= set(df[config.GENE_NAME_COLNAME].to_list()) + + if config.ORIGINAL_GENE_IDS_COLNAME in df.columns: + original_gene_ids = ( + df.select( + pl.col(config.ORIGINAL_GENE_IDS_COLNAME).str.split(by=",").explode() + ) + .to_series() + .to_list() + ) + unique_gene_ids |= set(original_gene_ids) + + # putting all unique gene IDs, gene names and original gene IDs into single list + all_unique_gene_ids = [gene for gene in unique_gene_ids if gene is not None] + + # formatting all gene IDs found + formated_gene_ids_df = pl.DataFrame({"gene": all_unique_gene_ids}).with_columns( + pl.col("gene") + .map_batches( + lambda x: format_genes(x), + return_dtype=pl.String, + ) + .alias("formatted_gene") + ) + + # formatting target genes + formated_target_genes_df = pl.DataFrame({"target_gene": target_genes}).with_columns( + pl.col("target_gene") + .map_batches( + lambda x: format_genes(x), + return_dtype=pl.String, + ) + .alias("formatted_gene") + ) + + return ( + formated_gene_ids_df.join( + formated_target_genes_df, on="formatted_gene", how="inner" + ) + .select(["target_gene", "gene"]) + .sort("target_gene") + .to_dicts() + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # -------------------------------------------------- + # Parsing counts + # -------------------------------------------------- + + count_df = get_counts(args.count_file) + # reducing dataframe size (it is only used for plotting by MultiQC) + count_df = cast_count_columns_to_float(count_df) + + # -------------------------------------------------- + # Parsing statistics and scores, section by section + # -------------------------------------------------- + + 
stat_score_dfs = [] + sections = [] + for file in args.stat_score_files: + # the section name is at the beginning of the file name + section = file.name.split(".")[0] + df = parse_stat_score_file(file) + df = df.with_columns(pl.lit(section).alias(config.SECTION_COLNAME)) + stat_score_dfs.append(df) + sections.append(section) + + stat_score_df = pl.concat(stat_score_dfs) + + if stat_score_df.select(config.GENE_ID_COLNAME).is_duplicated().any(): + raise ValueError("Duplicate gene IDs found in statistics and scores files.") + + # sorting sections in the order (from 1 to ) + sections = sorted(sections, key=lambda section: int(section.split("_")[-1])) + + # -------------------------------------------------- + # Parsing MultiQC template config for custom content + # -------------------------------------------------- + + with open(args.multiqc_config, "r") as f: + multiqc_config = yaml.safe_load(f.read()) + + # putting template parts aside + ranking_dict = multiqc_config["custom_data"][ + "ranked_most_stable_genes_summary_template" + ] + ranking_sp_dict = multiqc_config["sp"]["ranked_most_stable_genes_summary_template"] + expr_distrib_dict = multiqc_config["custom_data"][ + "expr_distrib_most_stable_genes_template" + ] + expr_distrib_sp_dict = multiqc_config["sp"][ + "expr_distrib_most_stable_genes_template" + ] + other_sections = multiqc_config["custom_content"]["order"] + + del multiqc_config["custom_data"]["ranked_most_stable_genes_summary_template"] + del multiqc_config["sp"]["ranked_most_stable_genes_summary_template"] + del multiqc_config["custom_data"]["expr_distrib_most_stable_genes_template"] + del multiqc_config["sp"]["expr_distrib_most_stable_genes_template"] + del multiqc_config["custom_content"]["order"] + + # filling dynamically the number of genes to show in box plots + expr_distrib_dict["description"] = expr_distrib_dict["description"].replace( + "NB_GENES", str(NB_TOP_GENES_TO_SHOW_IN_BOX_PLOTS) + ) + + # -------------------------------------------------- 
+ # Parsing statistics per platform + # -------------------------------------------------- + + platform_datasets_stat_dfs = [ + parse_stat_score_file(file) + for file in (args.platform_stat_files or []) + if file is not None + ] + + # -------------------------------------------------- + # Parsing metadata and mapping files + # -------------------------------------------------- + + metadata_files = ( + [Path(file) for file in args.metadata_files.split(" ")] + if args.metadata_files is not None + else [] + ) + mapping_files = ( + [Path(file) for file in args.mapping_files.split(" ")] + if args.mapping_files is not None + else [] + ) + + # parsing metadata and mapping files + metadata_df = get_metadata(metadata_files) + mapping_df = get_mappings(mapping_files) + optional_dfs = [df for df in [metadata_df, mapping_df] if df is not None] + + # -------------------------------------------------- + # Adding metadata, mapping and platform statistics information to gene summary table + # -------------------------------------------------- + + additional_data_dfs = optional_dfs + platform_datasets_stat_dfs + all_genes_summary_df = complement_gene_summary_table( + stat_score_df, *additional_data_dfs + ) + + logger.info(f"Exporting statistics of all genes to: {ALL_GENE_SUMMARY_OUTFILENAME}") + # sorting values in order to have consistent output + all_genes_summary_df.sort(by=config.GENE_ID_COLNAME).write_csv( + ALL_GENE_SUMMARY_OUTFILENAME, float_precision=config.CSV_FLOAT_PRECISION + ) + + # -------------------------------------------------- + # Getting summary table and counts for each section + # Adding new sections in MultiQC config for each new expression section + # -------------------------------------------------- + + nb_sections = len(sections) + new_mqc_config_sections = {} + new_mqc_config_sp = {} + + logger.info("Making new sections in the MultiQC config") + for section in sections: + # getting best candidates for this section + + section_df = ( + 
all_genes_summary_df.filter(pl.col("section") == section) + .drop("section") + .sort(config.STABILITY_SCORE_COLNAME, nulls_last=True, maintain_order=True) + ) + + found_target_genes = [] + if args.target_genes: + found_target_genes = search_target_genes(section_df, args.target_genes) + + section_most_stable_genes_counts_df = get_most_stable_genes_counts( + count_df, section_df + ) + + section_summary_outfile = f"{section}.{SUMMARY_OUTFILENAME_SUFFIX}" + write_float_csv(section_df, section_summary_outfile) + + section_counts_outfile = f"{section}.{COUNTS_OUTFILENAME_SUFFIX}" + write_float_csv(section_most_stable_genes_counts_df, section_counts_outfile) + + # making new sections in the MultiQC config + new_mqc_config_sections[f"genes_{section}"] = format_multiqc_section( + section, nb_sections, ranking_dict, found_target_genes + ) + new_mqc_config_sections[f"normalised_expr_distrib_{section}"] = ( + format_multiqc_section( + section, nb_sections, expr_distrib_dict, found_target_genes + ) + ) + new_mqc_config_sp[f"genes_{section}"] = format_multiqc_sp( + section, ranking_sp_dict + ) + new_mqc_config_sp[f"normalised_expr_distrib_{section}"] = format_multiqc_sp( + section, expr_distrib_sp_dict + ) + + # adding new sections + multiqc_config["custom_data"] = ( + new_mqc_config_sections | multiqc_config["custom_data"] + ) + # specifying the filenames linked to the new sections + multiqc_config["sp"] = new_mqc_config_sp | multiqc_config["sp"] + # specifying the section order + multiqc_config["custom_content"]["order"] = ( + list(new_mqc_config_sections.keys()) + other_sections + ) + + with open(CUSTOM_CONTENT_MULTIQC_CONFIG_FILE, "w") as f: + yaml.dump(multiqc_config, f, indent=4, sort_keys=False) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/bin/clean_gene_ids.py b/bin/clean_gene_ids.py new file mode 100755 index 00000000..1e18c15b --- /dev/null +++ b/bin/clean_gene_ids.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +# Written by Olivier 
Coen. Released under the MIT license. + +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +CLEANED_COUNTS_SUFFIX = ".cleaned.parquet" + +FAILURE_REASON_FILE = "failure_reason.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Rename gene IDs using mapped IDs") + parser.add_argument( + "--count-file", type=Path, required=True, help="Input file containing counts" + ) + return parser.parse_args() + + +def clean_ensembl_gene_id_versioning(df: pl.DataFrame): + """ + Clean Ensembl gene IDs by removing version numbers. + Remove the dot and the numbers after it in IDs like ENSG00000000003.17 + """ + return df.with_columns( + pl.when(pl.col(config.GENE_ID_COLNAME).str.starts_with("ENSG")) + .then(pl.col(config.GENE_ID_COLNAME).str.extract(r"^(ENSG[a-zA-Z0-9]+)", 1)) + .otherwise(pl.col(config.GENE_ID_COLNAME)) + .alias(config.GENE_ID_COLNAME) + ) + + +def clean_mirna_ids(df: pl.DataFrame): + """ + Clean miRNA IDs by removing the 5p / 3p identifier. 
+ """ + return df.with_columns( + pl.when(pl.col(config.GENE_ID_COLNAME).str.contains(r"-[53]p$")) + .then(pl.col(config.GENE_ID_COLNAME).str.extract(r"^(.*?)-[53]p$")) + .otherwise(pl.col(config.GENE_ID_COLNAME)) + .alias(config.GENE_ID_COLNAME) + ) + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + logger.info(f"Converting IDs for count file {args.count_file.name}...") + + ############################################################# + # PARSING FILES + ############################################################# + + df = parse_count_table(args.count_file) + + if df.is_empty(): + msg = "COUNT FILE IS EMPTY" + logger.warning(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + try: + df = clean_ensembl_gene_id_versioning(df) + df = clean_mirna_ids(df) + except Exception as e: + msg = f"ERROR CLEANING IDS in count file {args.count_file.name}: {e}" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + ############################################################# + # WRITING CLEANED COUNTS + ############################################################# + + logger.info("Writing count file with cleaned IDs") + count_outfile = args.count_file.with_name( + args.count_file.stem + CLEANED_COUNTS_SUFFIX + ) + df.write_parquet(count_outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/collect_gene_ids.py b/bin/collect_gene_ids.py new file mode 100755 index 00000000..d4531444 --- /dev/null +++ b/bin/collect_gene_ids.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from collections import Counter +from pathlib import Path + +import config +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +UNIQUE_GENE_IDS_OUTFILE = "unique_gene_ids.txt" +GENE_ID_OCCURRENCES_OUTFILE = "gene_id_occurrences.csv" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Collect gene IDs from count files") + parser.add_argument( + "--ids", type=str, dest="gene_id_files", required=True, help="Gene ID files" + ) + return parser.parse_args() + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + gene_id_files = [Path(file) for file in args.gene_id_files.split(" ")] + logger.info(f"Getting gene IDs from {len(gene_id_files)} files") + + unique_gene_ids = set() + counter = Counter() + for gene_id_file in tqdm(gene_id_files): + with open(gene_id_file, "r") as fin: + gene_ids = [line.strip() for line in fin] + unique_gene_ids.update(gene_ids) + counter.update(gene_ids) + + with open(UNIQUE_GENE_IDS_OUTFILE, "w") as fout: + fout.write("\n".join([str(gene_id) for gene_id in sorted(unique_gene_ids)])) + + with open(GENE_ID_OCCURRENCES_OUTFILE, "w") as fout: + fout.write( + f"{config.ORIGINAL_GENE_ID_COLNAME},{config.GENE_ID_COUNT_COLNAME}\n" + ) + for gene_id, count in sorted(counter.items()): + fout.write(f"{gene_id},{count}\n") + + +if __name__ == "__main__": + main() diff --git a/bin/collect_statistics.py b/bin/collect_statistics.py new file mode 100755 index 00000000..418adc28 --- /dev/null +++ 
b/bin/collect_statistics.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import pandas as pd + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Collect statistics") + parser.add_argument( + "--file", + type=Path, + required=True, + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + logger.info("Collecting statistics...") + # parsing file manually because it's not a standard CSV format + with open(args.file, "r") as f: + lines = f.readlines() + data = [line.strip().split(",") for line in lines] + + # getting max number of columns + max_nb_cols = max(len(row) for row in data) + # fill missing values with None + for row in data: + row += [None] * (max_nb_cols - len(row)) + + df = pd.DataFrame(data) + # the first item is the dataset name + df.set_index(df.columns[0], inplace=True) + + outfile = args.file.name.replace(".csv", ".transposed.csv") + logger.info(f"Saving statistics to {outfile}") + df.T.to_csv(outfile, index=False, header=True) + + +if __name__ == "__main__": + main() diff --git a/bin/common.py b/bin/common.py new file mode 100644 index 00000000..59e1bb7e --- /dev/null +++ b/bin/common.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_header(file: Path, sep: str): + with open(file, "r") as fin: + header = fin.readline().strip().split(sep) + first_row = fin.readline().strip().split(sep) + if len(header) == len(first_row): + return header + elif len(header) == len(first_row) - 1: + return [config.GENE_ID_COLNAME] + header + else: + raise ValueError( + f"Header has length: {len(header)} while first row has length: {len(first_row)}" + ) + + +def parse_table(file: Path): + # parsing header first + if file.suffix in [".csv", ".tsv"]: + # parsing header manually + sep = "," if file.suffix == ".csv" else "\t" + header = parse_header(file, sep) + return pl.read_csv( + file, + separator=sep, + has_header=False, + skip_rows=1, + new_columns=header, + null_values=["NA", "N/A", "na", "n/a"], + ) + elif file.suffix == ".parquet": + return pl.read_parquet(file) + else: + raise ValueError(f"Unsupported file format: {file.suffix}") + + +def parse_count_table(file: Path): + df = parse_table(file) + first_col = df.columns[0] + # whatever the name of the first col, rename it to "gene_id" + return df.rename({first_col: config.GENE_ID_COLNAME}).select( + pl.col(config.GENE_ID_COLNAME).cast(pl.String()), + pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()), + ) + + +def compute_log2(df: pl.DataFrame) -> pl.DataFrame: + """ + Compute log2 values. 
+ """ + return df.select( + pl.col(config.GENE_ID_COLNAME), + (pl.exclude(config.GENE_ID_COLNAME) + 1).log(base=2), + ) + + +def export_parquet(df: pl.DataFrame, count_file: Path, suffix: str): + outfilename = count_file.with_suffix(suffix).name + logger.info(f"Exporting processed counts to: {outfilename}") + df.write_parquet(outfilename) + + +def write_float_csv(df: pl.DataFrame, outfilename: str): + df.write_csv(outfilename, float_precision=config.CSV_FLOAT_PRECISION) diff --git a/bin/compute_cpm.py b/bin/compute_cpm.py new file mode 100755 index 00000000..b2548896 --- /dev/null +++ b/bin/compute_cpm.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import compute_log2, export_parquet, parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +OUTFILE_SUFFIX = ".cpm.parquet" + +WARNING_REASON_FILE = "warning_reason.txt" +FAILURE_REASON_FILE = "failure_reason.txt" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Normalise data to CPM") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + return parser.parse_args() + + +def calculate_cpm(df: pl.DataFrame) -> pl.DataFrame: + """ + Calculate CPM (Counts Per Million) from raw count data. 
+ + Parameters: + ----------- + counts_df : polars.DataFrame + DataFrame with genes as rows and samples as columns + + Returns: + -------- + cpm_df : polars.DataFrame + DataFrame with CPM values + """ + # Calculate total counts per sample (column sums) + sums = df.select(pl.exclude(config.GENE_ID_COLNAME).sum()) + + # Calculate CPM: (count / total_counts) * 1,000,000 + count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + return df.select( + [pl.col(config.GENE_ID_COLNAME)] + + [(pl.col(col) / sums[col][0] * 1e6).alias(col) for col in count_columns] + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + logger.info("Parsing data") + + try: + count_df = parse_count_table(args.count_file) + + logger.info(f"Normalising {args.count_file.name}") + count_df = calculate_cpm(count_df) + + logger.info("Computing log2 values") + count_df = compute_log2(count_df) + + export_parquet(count_df, args.count_file, OUTFILE_SUFFIX) + + except Exception as e: + logger.error(f"Error occurred while normalising data: {e}") + msg = "UNEXPECTED ERROR" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_dataset_statistics.py b/bin/compute_dataset_statistics.py new file mode 100755 index 00000000..48671561 --- /dev/null +++ b/bin/compute_dataset_statistics.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import parse_count_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +KEY_TO_OUTFILE = {"skewness": "skewness.txt"} +FLOAT_PRECISION = 6 + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Compute general statistics from count data for each sample" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + return parser.parse_args() + + +def compute_dataset_statistics(df: pl.DataFrame) -> dict: + # sample count skewness + skewness = df.select(pl.exclude(config.GENE_ID_COLNAME).skew()).row(0) + return dict(skewness=list(skewness)) + + +def format_value(value: float) -> str: + return f"{value:.{FLOAT_PRECISION}f}" if value != 0 else "0" + + +def export_count_data(stats: dict): + """ + Export dataset statistics to CSV files. 
+ Write each statistic to a separate file, on a single row + """ + for key, outfile_name in KEY_TO_OUTFILE.items(): + logger.info(f"Exporting dataset statistics {key} to: {outfile_name}") + with open(outfile_name, "w") as outfile: + outfile.write(",".join([format_value(val) for val in stats[key]])) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + count_file = args.count_file + + logger.info(f"Computing dataset statistics for {count_file.name}") + count_df = parse_count_table(count_file) + + stat_dict = compute_dataset_statistics(count_df) + + export_count_data(stat_dict) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_gene_statistics.py b/bin/compute_gene_statistics.py new file mode 100755 index 00000000..ec250cf7 --- /dev/null +++ b/bin/compute_gene_statistics.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +ALL_GENES_RESULT_OUTFILE_SUFFIX = "stats_all_genes.csv" + +RCV_MULTIFILER = 1.4826 # see https://pmc.ncbi.nlm.nih.gov/articles/PMC9196089/ + +# quantile intervals +NB_QUANTILES = 100 + + +############################################################################ +# POLARS EXTENSIONS +############################################################################ + + +@pl.api.register_expr_namespace("row") +class StatsExtension: + def __init__(self, expr: pl.Expr): + self._expr = expr + + def not_null_values(self): + return self._expr.list.drop_nulls().list + + def mean(self) -> pl.Expr: + """Mean over non nulls values in row""" + return self.not_null_values().mean() + + def std(self) -> pl.Expr: + """Std over non nulls values in row""" + return self.not_null_values().std() + + def median(self) -> pl.Expr: + """Median over non nulls values in row""" + return self.not_null_values().median() + + def mad(self) -> pl.Expr: + """Median Absolute Deviation over non nulls values in row""" + return ( + self.not_null_values() + .eval( + (pl.element() - pl.element().median()).abs().median() + ) # returns a list with one element + .list.first() + ) + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get base statistics from count data for each gene" + ) + parser.add_argument( + "--imputed-counts", + type=Path, + dest="imputed_count_file", + help="Count file with imputed missing values", + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + 
"--ratio-nulls-per-sample", + type=Path, + dest="ratio_nulls_per_samples", + required=True, + help="Ratio of null values per sample", + ) + parser.add_argument( + "--max-ratio-null-valid-sample", + type=float, + dest="max_ratio_null_valid_sample", + required=True, + help="Maximum ratio of null values for a sample to be considered valid", + ) + parser.add_argument("--platform", type=str, help="Platform name") + return parser.parse_args() + + +def get_counts(file: Path) -> pl.DataFrame: + # sorting dataframe (necessary to get consistent output) + return pl.read_parquet(file).sort(config.GENE_ID_COLNAME, descending=False) + + +def get_colname(colname: str, platform: str | None) -> str: + return f"{platform}_{colname}" if platform else colname + + +def get_samples(lf: pl.LazyFrame) -> list[str]: + return lf.select(pl.exclude(config.GENE_ID_COLNAME)).collect_schema().names() + + +def get_valid_samples( + ratio_nulls_per_samples_df: pl.DataFrame, max_ratio_null_valid_sample: float +) -> list[str]: + """ + Get samples whose ratio of null values is below the maximum ratio. 
+ """ + return ( + ratio_nulls_per_samples_df.filter( + pl.col(config.RATIO_COLNAME) <= max_ratio_null_valid_sample + ) + .select(config.SAMPLE_COLNAME) + .to_series() + .to_list() + ) + + +def compute_ratios_null_values( + df: pl.DataFrame, valid_samples: list[str], platform: str | None +) -> pl.DataFrame: + # the samples showing a low gene count will not be taken into account for the zero count penalty + nb_nulls = df.select(pl.exclude(config.GENE_ID_COLNAME).is_null()).sum_horizontal() + + found_valid_samples = [sample for sample in valid_samples if sample in df.columns] + + if found_valid_samples: + nb_nulls_valid_samples = df.select( + pl.col(found_valid_samples).is_null() + ).sum_horizontal() + else: + nb_nulls_valid_samples = nb_nulls + + nb_samples = len(df.columns) - 1 + return df.select( + pl.col(config.GENE_ID_COLNAME), + (nb_nulls / nb_samples).alias( + get_colname(config.RATIO_NULLS_COLNAME, platform) + ), + (nb_nulls_valid_samples / len(found_valid_samples)).alias( + get_colname(config.RATIO_NULLS_VALID_SAMPLES_COLNAME, platform) + ), + ) + + +def get_main_statistics(lf: pl.LazyFrame, platform: str | None) -> pl.LazyFrame: + """ + Compute count descriptive statistics for each gene in the count dataframe. 
+ """ + logger.info("Getting descriptive statistics") + samples = get_samples(lf) + # computing main stats + augmented_count_lf = lf.with_columns( + mean=pl.concat_list(samples).row.mean(), + std=pl.concat_list(samples).row.std(), + median=pl.concat_list(samples).row.median(), + mad=pl.concat_list(samples).row.mad(), + ) + + return augmented_count_lf.select( + pl.col(config.GENE_ID_COLNAME), + pl.col("mean").alias(get_colname(config.MEAN_COLNAME, platform)), + pl.col("std").alias(get_colname(config.STANDARD_DEVIATION_COLNAME, platform)), + pl.col("median").alias(get_colname(config.MEDIAN_COLNAME, platform)), + pl.col("mad").alias(get_colname(config.MAD_COLNAME, platform)), + (pl.col("std") / pl.col("mean")).alias( + get_colname(config.COEFFICIENT_OF_VARIATION_COLNAME, platform) + ), + (pl.col("mad") / pl.col("median") * RCV_MULTIFILER).alias( + get_colname(config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME, platform) + ), + ) + + +def compute_ratio_zeros( + count_lf: pl.LazyFrame, stat_lf: pl.LazyFrame, platform: str +) -> pl.LazyFrame: + nb_samples = len(get_samples(count_lf)) + nb_zeros_lf = count_lf.select( + (pl.sum_horizontal(pl.exclude(config.GENE_ID_COLNAME) == 0) / nb_samples).alias( + get_colname(config.RATIO_ZEROS_COLNAME, platform) + ) + ) + # return stat_lf + return pl.concat([stat_lf, nb_zeros_lf], how="horizontal") + + +def get_quantile_intervals(lf: pl.LazyFrame, platform: str) -> pl.LazyFrame: + """ + Compute the quantile intervals for the mean expression levels of each gene in the dataframe. + + The function assigns to each gene a quantile interval of its mean cpm compared to all genes. 
+ """ + logger.info("Getting mean expression quantiles") + mean_colname = get_colname(config.MEAN_COLNAME, platform) + return lf.with_columns( + ( + pl.col(mean_colname).rank(method="ordinal") + / pl.col(mean_colname).count() + * NB_QUANTILES + ) + .floor() + .cast(pl.Int8) + # we want the only value = NB_QUANTILES to be NB_QUANTILES - 1 + # because the last quantile interval is [NB_QUANTILES - 1, NB_QUANTILES] + .replace({NB_QUANTILES: NB_QUANTILES - 1}) + .alias(get_colname(config.EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME, platform)) + ) + + +def export_data(lf: pl.LazyFrame, platform: str | None): + """Export gene expression data to CSV files.""" + outfile = ( + f"{platform}.{ALL_GENES_RESULT_OUTFILE_SUFFIX}" + if platform + else ALL_GENES_RESULT_OUTFILE_SUFFIX + ) + logger.info(f"Exporting statistics for all genes to: {outfile}") + lf.sink_csv(outfile, float_precision=config.CSV_FLOAT_PRECISION) + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + ratio_nulls_per_samples_df = pl.read_csv(args.ratio_nulls_per_samples) + valid_samples = get_valid_samples( + ratio_nulls_per_samples_df, args.max_ratio_null_valid_sample + ) + + logger.info("Loading count data (before missing value imputation)") + non_imputed_count_df = get_counts(args.count_file) + + ratio_nulls_df = compute_ratios_null_values( + non_imputed_count_df, valid_samples, args.platform + ) + + # deleting non_imputed_count_df in order to free unused memory + del non_imputed_count_df + + # if the user provided an imputed count file, use it; otherwise, use the original count file + if args.imputed_count_file: + logger.info("Using imputed count file") + count_file = args.imputed_count_file + else: + logger.info("Using original count file") + count_file = 
args.count_file + + logger.info("Loading count data...") + count_df = get_counts(count_file) + logger.info( + f"Loaded count data with {count_df.shape[0]} rows and {count_df.shape[1]} columns" + ) + + logger.info("Computing statistics and stability score") + count_lf = count_df.lazy() + # getting expression statistics + stat_lf = get_main_statistics(count_lf, args.platform) + + # adding column for nb of null values for each gene + stat_lf = stat_lf.join( + ratio_nulls_df.lazy(), on=config.GENE_ID_COLNAME, how="inner" + ) + + # adding a column for the frequency of zero values + stat_lf = compute_ratio_zeros(count_lf, stat_lf, args.platform) + + # getting quantile intervals + stat_lf = get_quantile_intervals(stat_lf, args.platform) + + # exporting computed data + export_data(stat_lf, args.platform) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_gene_transcript_lengths.py b/bin/compute_gene_transcript_lengths.py new file mode 100755 index 00000000..1bc009bb --- /dev/null +++ b/bin/compute_gene_transcript_lengths.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import pandas as pd + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE = "gene_transcript_lengths.csv" + +GFF_COLUMNS = [ + "chromosome", + "source", + "feature", + "start", + "end", + "score", + "strand", + "phase", + "attributes", +] + +DTYPES = { + "chromosome": str, + "source": str, + "feature": str, + "start": int, + "end": int, + "score": str, + "strand": str, + "phase": str, + "attributes": str, +} + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get CDNA lengths from GFF3 annotation file") + parser.add_argument( + "--annotation", + type=Path, + dest="annotation_file", + required=True, + help="Annotation file in GFF3 format", + ) + return parser.parse_args() + + +def parse_gff3_file(annotation_file: Path) -> pd.DataFrame: + return pd.read_csv( + annotation_file, + sep="\t", + names=GFF_COLUMNS, + dtype=DTYPES, + comment="#", + on_bad_lines="warn", + ) + + +def compute_transcript_lengths(df: pd.DataFrame) -> pd.DataFrame: + exon_df = df.loc[df["feature"] == "exon"].copy() + # extract transcript ID from attributes column for each exon + exon_df["transcript_id"] = exon_df["attributes"].str.extract( + r"Parent=transcript:([^;]+)" + ) + # compute transcript length + exon_df[config.CDNA_LENGTH_COLNAME] = exon_df["end"] - exon_df["start"] + 1 + exon_df = exon_df[["transcript_id", config.CDNA_LENGTH_COLNAME]] + return exon_df.groupby("transcript_id", as_index=False).agg( + {config.CDNA_LENGTH_COLNAME: "sum"} + ) + + +def compute_max_transcript_lengths_per_gene( + df: pd.DataFrame, transcript_lengths_df: pd.DataFrame +) -> pd.DataFrame: + rna_cols = [ + 
feature + for feature in df["feature"].unique() + if "RNA" in feature and "gene" not in feature + ] + rna_df = df.loc[df["feature"].isin(rna_cols)].copy() + + # extract gene ID from attributes column for each transcript + rna_df[config.GENE_ID_COLNAME] = rna_df["attributes"].str.extract( + r"Parent=gene:([^;]+)" + ) + # extract transcript ID from attributes column + rna_df["transcript_id"] = rna_df["attributes"].str.extract(r"ID=transcript:([^;]+)") + + # merge with transcript lengths dataframe to get length + merged_df = rna_df.merge(transcript_lengths_df, how="left", on="transcript_id") + logger.info( + f"Got length for {len(merged_df) / len(rna_df) * 100:.2f}% of transcripts" + ) + # compute max transcript length per gene + merged_df = merged_df[[config.GENE_ID_COLNAME, config.CDNA_LENGTH_COLNAME]] + return merged_df.groupby(config.GENE_ID_COLNAME, as_index=False).agg( + {config.CDNA_LENGTH_COLNAME: "max"} + ) + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + logger.info("Parsing annotation file") + df = parse_gff3_file(args.annotation_file) + + logger.info("Computing transcript lengths") + transcript_lengths_df = compute_transcript_lengths(df) + + # keep only mRNA and exon features + logger.info("Getting max transcript length per gene") + gene_length_df = compute_max_transcript_lengths_per_gene(df, transcript_lengths_df) + + logger.info(f"Writing to {OUTFILE}") + gene_length_df.to_csv(OUTFILE, index=False, header=True) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_m_measures.py b/bin/compute_m_measures.py new file mode 100755 index 00000000..bc1d783e --- /dev/null +++ b/bin/compute_m_measures.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. 
Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +M_MEASURE_OUTFILE_NAME = "m_measures.csv" + +DEFAULT_CHUNKSIZE = 300 +NB_GENE_ID_CHUNK_FOLDERS = 100 + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--counts", + type=Path, + dest="count_file", + required=True, + help="File containing std of lof expression ratios", + ) + parser.add_argument( + "--std-files", + type=str, + dest="std_files", + required=True, + help="File containing std of lof expression ratios", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_nb_rows(lf: pl.LazyFrame): + return lf.select(pl.len()).collect().item() + + +def concat_all_std_data(files: list[Path], low_memory: bool) -> pl.LazyFrame: + lfs = [pl.scan_parquet(file, low_memory=low_memory) for file in files] + lf = pl.concat(lfs) + return ( + lf.explode(config.RATIOS_STD_COLNAME) + .group_by(config.GENE_ID_COLNAME) + .agg(pl.col(config.RATIOS_STD_COLNAME)) + ) + + +def compute_m_measures(lf: pl.LazyFrame) -> pl.LazyFrame: + return lf.select( + pl.col(config.GENE_ID_COLNAME), + ( + pl.col(config.RATIOS_STD_COLNAME).list.sum() + / (pl.col(config.RATIOS_STD_COLNAME).list.len() - 1) + ).alias(config.GENORM_M_MEASURE_COLNAME), + ) + + +def get_chunks(lst: list, chunksize: int): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), chunksize): + yield lst[i : i + chunksize] + + 
+##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + files = [Path(file) for file in args.std_files.split(" ")] + + logger.info("Getting list of gene IDs") + count_lf = pl.scan_parquet(args.count_file, low_memory=low_memory) + + ############################################################################# + # MAKING A FOLDER FOR EACH CHUNK OF GENE IDS + ############################################################################# + gene_ids = count_lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list() + gene_ids = sorted(gene_ids) + + chunksize = max( + 1, int(len(gene_ids) / NB_GENE_ID_CHUNK_FOLDERS) + ) # 1 if len(gene_ids) < NB_GENE_ID_CHUNK_FOLDERS + gene_id_list_chunks = list(get_chunks(gene_ids, chunksize=chunksize)) + + gene_id_chunk_folders = [] + for i in range(len(gene_id_list_chunks)): + gene_id_chunk_folder = Path(f"gene_ids_{i}") + gene_id_chunk_folder.mkdir(exist_ok=True) + gene_id_chunk_folders.append(gene_id_chunk_folder) + + ############################################################################# + # EXPORTING GENE DATA TO THEIR RESPECTIVE CHUNK FOLDER + ############################################################################# + # progressively decreasing the chunksize if OOM + chunksize = int(DEFAULT_CHUNKSIZE / args.task_attempts) + chunk_files_list = [ + files[i : i + chunksize] for i in range(0, len(files), chunksize) + ] + + logger.info("Parsing std data by chunks") + for i, chunk_files in enumerate(chunk_files_list): + # parsing files and making a first list concatenation + concat_lf = concat_all_std_data(chunk_files, low_memory) + + # looping through each group of gene IDs + for j, (gene_id_list_chunk, gene_id_chunk_folder) in enumerate( + 
zip(gene_id_list_chunks, gene_id_chunk_folders) + ): + # writing all data corresponding to this group of gene IDs in a specific folder + outfile = gene_id_chunk_folder / f"chunk.{i}.parquet" + concat_df = concat_lf.filter( + pl.col(config.GENE_ID_COLNAME).is_in(gene_id_list_chunk) + ).collect() + concat_df.write_parquet(outfile) + + ############################################################################# + # GATHERING ALL DATA CHUNK BY CHUNK AND COMPUTING M MEASURE FOR EACH GENE + ############################################################################# + computed_genes = 0 + nb_ratios_per_gene = set() + logger.info( + "Concatenating all std data by chunk of gene IDs and computing M measures" + ) + with open(M_MEASURE_OUTFILE_NAME, "a") as fout: + for i, gene_id_chunk_folder in enumerate(gene_id_chunk_folders): + chunk_files = list(gene_id_chunk_folder.iterdir()) + + concat_lf = concat_all_std_data(chunk_files, low_memory).sort( + config.GENE_ID_COLNAME + ) + + # computing M measures for these gene IDs + m_measure_lf = compute_m_measures(concat_lf) + m_measure_df = m_measure_lf.collect() + + ################################################# + # checks + ################################################# + if m_measure_df[config.GENE_ID_COLNAME].is_duplicated().any(): + raise ValueError("Duplicate values found for gene IDs!") + + process_gene_ids = sorted( + m_measure_df.select(config.GENE_ID_COLNAME).to_series().to_list() + ) + if process_gene_ids != gene_id_list_chunks[i]: + raise ValueError("Incorrect gene IDs found!") + + computed_genes += len(m_measure_df) + + unique_nb_ratios = ( + concat_lf.with_columns( + pl.col(config.RATIOS_STD_COLNAME).list.len().alias("length") + ) + .select("length") + .unique() + .collect() + .to_series() + .to_list() + ) + nb_ratios_per_gene.update(unique_nb_ratios) + + ################################################# + ################################################# + + # appending to output file + if i == 0: + 
m_measure_df.write_csv( + fout, + include_header=True, + float_precision=config.CSV_FLOAT_PRECISION, + ) + else: + m_measure_df.write_csv( + fout, + include_header=False, + float_precision=config.CSV_FLOAT_PRECISION, + ) + + logger.info(f"Number of gene IDs: {len(gene_ids)}") + logger.info(f"Number of computed genes: {computed_genes}") + if computed_genes != len(gene_ids): + raise ValueError( + f"Number of computed genes: {computed_genes} != number of gene IDs: {len(gene_ids)}" + ) + + if len(nb_ratios_per_gene) > 1: + logger.warning( + f"Got multiple number of std ratios to compute: {list(nb_ratios_per_gene)}" + ) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_stability_scores.py b/bin/compute_stability_scores.py new file mode 100755 index 00000000..ad7fbba2 --- /dev/null +++ b/bin/compute_stability_scores.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import ClassVar + +import config +import polars as pl +from common import write_float_csv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# outfile names +STATISTICS_WITH_SCORES_OUTFILENAME = "stats_with_scores.csv" + + +@dataclass +class StabilityScorer: + N_QUANTILES: ClassVar[int] = 1000 + + WEIGHT_FIELDS: ClassVar[list[str]] = [ + config.NORMFINDER_STABILITY_VALUE_COLNAME, + config.GENORM_M_MEASURE_COLNAME, + config.COEFFICIENT_OF_VARIATION_COLNAME, + config.ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME, + ] + + WEIGHT_RATIO_NB_NULLS_TO_SCORING: ClassVar[float] = 1 + + df: pl.DataFrame + stability_score_weights_str: str + weights: dict[str, float] = field(default_factory=dict) + + def __post_init__(self): + self.parse_stability_score_weights() + self.compute_stability_score() + + def parse_stability_score_weights(self): + for weight_field, weight in zip( + self.WEIGHT_FIELDS, 
self.stability_score_weights_str.split(",") + ): + self.weights[weight_field] = float(weight) + + def linear_normalise(self, data: pl.Series, new_name: str) -> pl.Series: + """ + Linearly normalise a series + """ + min_val = data.min() + max_val = data.max() + return pl.Series(new_name, (data - min_val) / (max_val - min_val)) + + @staticmethod + def get_normalised_col(col: str) -> str: + return f"{col}_normalised" + + def compute_stability_score(self): + logger.info("Computing stability score for candidate genes") + + # since Normfinder is always run + # we can distinguish between candidate and non-candidate genes easily with this column + self.df = self.df.with_columns( + pl.when(pl.col(config.NORMFINDER_STABILITY_VALUE_COLNAME).is_not_null()) + .then(1) + .otherwise(0) + .alias(config.IS_CANDIDATE_COLNAME) + ) + + # dividing the dataframe into two parts: candidate and non-candidate genes + candidate_df = self.df.filter( + pl.col(config.IS_CANDIDATE_COLNAME) == 1 + ) # keep only candidate genes + non_candidate_df = self.df.filter(pl.col(config.IS_CANDIDATE_COLNAME) == 0) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # DATA NORMALISATION (TO [0, 1]) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + normalised_data = {} + null_data = {} + weight_sum = 0 + # iterate over columns that can participate in stability score calculation + for col, weight in self.weights.items(): + # if a column is absent, skip it + if col not in self.df.columns: + continue + data = candidate_df.select(col).to_series() + # for each column present, we perform linear transformation to have values between 0 and 1 + # and put these normalised data in another column suffixed with "_normalised" + normalised_col = self.get_normalised_col(col) + normalised_data[col] = self.linear_normalise(data, new_name=normalised_col) + # creating a null column with same name + null_data[col] = pl.Series(normalised_col, [None] * len(non_candidate_df)) + # 
counting the sum of weights corresponding to the columns present + # so that we can normalise the weights afterwards + weight_sum += weight + + # replacing original data with linearly normalised ones + candidate_df = candidate_df.with_columns( + data for data in normalised_data.values() + ) + # adding null columns to the non-candidate df to allow concatenation + non_candidate_df = non_candidate_df.with_columns( + data for data in null_data.values() + ) + + # concatenating with non candidate genes to have all genes + self.df = pl.concat([candidate_df, non_candidate_df]) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # GENERAL FORMULA FOR STABILITY + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # adding penalty for samples with null values + # genes with at least one zero value are already excluded at that stage + stability_scoring_expr = ( + pl.col(config.RATIO_NULLS_VALID_SAMPLES_COLNAME) + * self.WEIGHT_RATIO_NB_NULLS_TO_SCORING + ) + + for col, weight in self.weights.items(): + if col not in self.df.columns: + logger.warning(f"Column {col} not found in dataframe") + continue + normalised_col = self.get_normalised_col(col) + # we do not want to include null / nan values in the stability score calculation + # because this would result in a total null / nan value for the stability score + stability_scoring_expr += ( + pl.when( + pl.col(normalised_col).is_not_null() + & pl.col(normalised_col).is_not_nan() + ) + .then(pl.col(normalised_col)) + .otherwise(pl.lit(0)) + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + expr = ( + pl.when(pl.col(config.IS_CANDIDATE_COLNAME) == 1) + .then(stability_scoring_expr) + .otherwise(None) + ) + # add stability score column + self.df = self.df.with_columns(expr.alias(config.STABILITY_SCORE_COLNAME)) + + def get_statistics_with_stability_scores(self) -> pl.DataFrame: + 
return ( + self.df.sort( + config.STABILITY_SCORE_COLNAME, descending=False, nulls_last=True + ) + .with_row_index(name="index") + .with_columns((pl.col("index") + 1).alias(config.RANK_COLNAME)) + .drop("index") + ) + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Computes stability score for each gene" + ) + parser.add_argument( + "--stats", + type=Path, + dest="stats_file", + required=True, + help="Gene Statistics file", + ) + parser.add_argument( + "--normfinder-stability", + type=str, + required=True, + dest="normfinder_stability_file", + help="Output files of Normfinder", + ) + parser.add_argument( + "--genorm-stability", + type=str, + dest="genorm_stability_file", + help="Output files of Genorm", + ) + parser.add_argument( + "--weights", + dest="stability_score_weights", + type=str, + required=True, + help="Weights for Coefficient of Variation / Robust Coefficient of Variation on Median / Normfinder / Genorm respectively. Must be a comma-separated string. 
Example: 0.7,0.1,0.1,0.1", + ) + return parser.parse_args() + + +def get_stabilities(stability_files: list[Path]) -> pl.DataFrame: + """Retrieve and concatenate stability values from a list of stability files.""" + df = pl.read_csv(stability_files[0]) + if len(stability_files) > 1: + for file in stability_files[1:]: + new_df = pl.read_csv(file) + df = df.join(new_df, on=config.GENE_ID_COLNAME, how="left") + return df.with_columns(pl.lit(1).alias(config.IS_CANDIDATE_COLNAME)) + + +def get_statistics(stat_files: list[Path]) -> pl.DataFrame: + """Retrieve and concatenate data from a list of statistics files.""" + df = pl.read_csv(stat_files[0]) + if len(stat_files) > 1: + for file in stat_files[1:]: + new_df = pl.read_csv(file) + df = df.join(new_df, on=config.GENE_ID_COLNAME, how="left") + return df + + +def export_data(scored_df: pl.DataFrame): + """Export gene expression data to CSV files.""" + logger.info(f"Exporting stability scores to: {STATISTICS_WITH_SCORES_OUTFILENAME}") + write_float_csv(scored_df, STATISTICS_WITH_SCORES_OUTFILENAME) + logger.info("Done") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + stat_df = pl.read_parquet(args.stats_file) + + stability_files = [ + Path(file) + for file in [args.normfinder_stability_file, args.genorm_stability_file] + if file is not None + ] + + # getting metadata and mappings + stability_df = get_stabilities(stability_files) + # merges base statistics with computed stability measurements + df = stat_df.join(stability_df, on=config.GENE_ID_COLNAME, how="left") + + # sort genes according to the metrics present in the dataframe + stability_scorer = StabilityScorer(df, args.stability_score_weights) + scored_df = stability_scorer.get_statistics_with_stability_scores() + + # exporting computed data + 
export_data(scored_df) + + +if __name__ == "__main__": + main() diff --git a/bin/compute_tpm.py b/bin/compute_tpm.py new file mode 100755 index 00000000..0dd889f8 --- /dev/null +++ b/bin/compute_tpm.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +import sys +from pathlib import Path + +import config +import polars as pl +from common import ( + compute_log2, + export_parquet, + parse_count_table, + parse_table, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +OUTFILE_SUFFIX = ".tpm.parquet" + +WARNING_REASON_FILE = "warning_reason.txt" +FAILURE_REASON_FILE = "failure_reason.txt" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Normalise data to TPM") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--gene-lengths", + type=Path, + dest="gene_lengths_file", + required=True, + help="Gene lengths file (CSV format)", + ) + return parser.parse_args() + + +def try_cast_to_int(df: pl.DataFrame) -> pl.DataFrame: + """Try casting columns to integers.""" + count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + # try casting to handle integer values that are float-formated like 1.0 + for col in count_columns: + is_all_integers = df.select(pl.col(col).round().eq(pl.col(col)).all()).item() + if is_all_integers: + df = df.with_columns(pl.col(col).cast(pl.Int64())) + return df + + +def is_raw_counts(df: pl.DataFrame) -> bool: + """Check if the data are raw counts (integers).""" + count_columns = df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + return all( + dtype + in ( + pl.Int8(), + 
pl.Int16(), + pl.Int32(), + pl.Int64(), + pl.UInt8(), + pl.UInt16(), + pl.UInt32(), + pl.UInt64(), + ) + for dtype in df.select(count_columns).schema.values() + ) + + +def is_tpm(df: pl.DataFrame) -> bool: + """Check if the data are TPM (sum to 1e6 per sample).""" + sample_sums_df = df.select(pl.exclude(config.GENE_ID_COLNAME).sum()) + # a small error is possible, and we assume that if the sum is close to 1e6, it is TPM + # setting the tolerance to 100 + is_tpm_col_df = sample_sums_df.select((pl.all() - 1e6).abs() < 1e2) + return is_tpm_col_df.select( + pl.any_horizontal(pl.all()) + ).item() # Allow for floating-point precision + + +def compute_rpkm(df: pl.DataFrame, cdna_length_df: pl.DataFrame) -> pl.DataFrame: + """ + Process raw counts to RPKM. + """ + logger.info("Computing RPKM.") + df = df.join(cdna_length_df, on=config.GENE_ID_COLNAME) + return df.select( + pl.col(config.GENE_ID_COLNAME), + pl.exclude([config.GENE_ID_COLNAME, config.CDNA_LENGTH_COLNAME]).truediv( + pl.col(config.CDNA_LENGTH_COLNAME) + ), + ) + + +def compute_tpm_from_rpkm(rpkm_df: pl.DataFrame) -> pl.DataFrame: + """ + Process RPKM to TPM. + """ + logger.info("Computing TPM from RPKM.") + sums = rpkm_df.select(pl.exclude(config.GENE_ID_COLNAME).sum()) + # Divide each column by its sum and multiply by 1e6 + count_columns = rpkm_df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + return rpkm_df.select( + [pl.col(config.GENE_ID_COLNAME)] + + [(pl.col(col) / sums[col][0] * 1e6).alias(col) for col in count_columns], + ) + + +def compute_tpm(df: pl.DataFrame, cdna_length_df: pl.DataFrame) -> pl.DataFrame: + """ + Process raw counts, FPKM, or RPKM to TPM. + """ + if is_raw_counts(df): + logger.info("Raw counts detected → computing TPM directly.") + rpkm_df = compute_rpkm(df, cdna_length_df) + return compute_tpm_from_rpkm(rpkm_df) + elif is_tpm(df): + logger.info("Data are already TPM. 
No conversion needed.") + return df + else: + # Convert FPKM/RPKM to TPM + logger.info("Assuming FPKM/RPKM normalisation.") + return compute_tpm_from_rpkm(df) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + try: + logger.info("Parsing data") + count_df = parse_count_table(args.count_file) + cdna_length_df = parse_table(args.gene_lengths_file) + + logger.info("Converting data types") + count_df = try_cast_to_int(count_df) + + logger.info(f"Normalising {args.count_file.name}") + count_df = compute_tpm(count_df, cdna_length_df) + + logger.info("Computing log2 values") + count_df = compute_log2(count_df) + + export_parquet(count_df, args.count_file, OUTFILE_SUFFIX) + + except Exception as e: + logger.error(f"Error occurred while normalising data: {e}") + msg = "UNEXPECTED ERROR" + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as f: + f.write(msg) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/bin/config.py b/bin/config.py new file mode 100644 index 00000000..fe217991 --- /dev/null +++ b/bin/config.py @@ -0,0 +1,44 @@ +# general column names +GENE_ID_COLNAME = "gene_id" +GENE_ID_COUNT_COLNAME = "count" +CDNA_LENGTH_COLNAME = "length" +RANK_COLNAME = "rank" + +# base statistics +COEFFICIENT_OF_VARIATION_COLNAME = "coefficient_of_variation" +ROBUST_COEFFICIENT_OF_VARIATION_MEDIAN_COLNAME = ( + "robust_coefficient_of_variation_median" +) +STANDARD_DEVIATION_COLNAME = "standard_deviation" +STABILITY_SCORE_COLNAME = "stability_score" +MEAN_COLNAME = "mean" +MEDIAN_COLNAME = "median" +MAD_COLNAME = "median_absolute_deviation" +EXPRESSION_LEVEL_STATUS_COLNAME = "expression_level_status" +EXPRESSION_LEVEL_QUANTILE_INTERVAL_COLNAME = "expression_level_quantile_interval" +RATIO_NULLS_COLNAME = "ratio_nulls_in_all_samples" 
+RATIO_NULLS_VALID_SAMPLES_COLNAME = "ratio_nulls_in_valid_samples" +RATIO_ZEROS_COLNAME = "ratio_zeros" +IS_CANDIDATE_COLNAME = "is_candidate" + +# dataset statistics +KS_TEST_COLNAME = "kolmogorov_smirnov_pvalue" + +# count dataframe +GENE_COUNT_COLNAME = "count" +SAMPLE_COLNAME = "sample" +RATIO_COLNAME = "ratio" + +# gene metadata +ORIGINAL_GENE_ID_COLNAME = "original_gene_id" +ORIGINAL_GENE_IDS_COLNAME = "original_gene_ids" +GENE_NAME_COLNAME = "name" +GENE_DESCRIPTION_COLNAME = "description" +SECTION_COLNAME = "section" + +# computed stability values +NORMFINDER_STABILITY_VALUE_COLNAME = "normfinder_stability_value" +GENORM_M_MEASURE_COLNAME = "genorm_m_measure" +RATIOS_STD_COLNAME = "ratios_stds" + +CSV_FLOAT_PRECISION = 6 diff --git a/bin/detect_rare_genes.py b/bin/detect_rare_genes.py new file mode 100755 index 00000000..02cc2831 --- /dev/null +++ b/bin/detect_rare_genes.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import parse_table + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +VALID_GENE_IDS_OUTFILE = "valid_gene_ids.txt" +TOTAL_OCCURRENCES_OUTFILE = "total_gene_id_occurrence_quantiles.csv" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get genes with good occurrence") + parser.add_argument( + "--occurrences", + type=Path, + required=True, + dest="gene_id_occurrence_file", + help="Input file containing gene ID occurrences", + ) + parser.add_argument( + "--mappings", + type=Path, + required=True, + dest="mapping_file", + help="Mapping file containing gene IDs", + ) + parser.add_argument( + "--nb-datasets", + type=int, + required=True, + dest="nb_datasets", + help="Number of datasets", + 
) + parser.add_argument( + "--min-occurrence-frequency", + type=float, + required=True, + dest="min_occurrence_frequency", + help="Minimum frequency of occurrences for a gene among all datasets", + ) + parser.add_argument( + "--min-occurrence-quantile", + type=float, + required=True, + dest="min_occurrence_quantile", + help="Minimum frequency of occurrences for a gene among all datasets", + ) + return parser.parse_args() + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + original_gene_id_occurrence_df = parse_table(args.gene_id_occurrence_file) + mapping_df = parse_table(args.mapping_file) + nb_mapped_genes = len(mapping_df) + + df = original_gene_id_occurrence_df.join( + mapping_df, + on=config.ORIGINAL_GENE_ID_COLNAME, + ) + + total_gene_id_occurrence_df = df.group_by(config.GENE_ID_COLNAME).agg( + pl.col(config.GENE_ID_COUNT_COLNAME).sum().alias("total_occurrences") + ) + + df = ( + df.join( + total_gene_id_occurrence_df, + on=config.GENE_ID_COLNAME, + ) + .with_columns( + total_occurrences_quantile=( + pl.col("total_occurrences").rank(method="max") + / pl.col("total_occurrences").count() + ), + total_occurrences_frequency=( + pl.col("total_occurrences") / args.nb_datasets + ), + ) + .select( + [ + config.GENE_ID_COLNAME, + "total_occurrences_frequency", + "total_occurrences_quantile", + ] + ) + .unique() + ) + + # sorting (for output consistency) + df = df.sort(["total_occurrences_quantile", "gene_id"], descending=[True, False]) + + # writing total occurrences in a csv before filtering + df.select([config.GENE_ID_COLNAME, "total_occurrences_quantile"]).write_csv( + TOTAL_OCCURRENCES_OUTFILE + ) + + # filtering genes + valid_gene_ids = ( + df.filter(pl.col("total_occurrences_quantile") >= args.min_occurrence_quantile) + .filter(pl.col("total_occurrences_frequency") >= args.min_occurrence_frequency) + 
.select(config.GENE_ID_COLNAME) + .unique() + .to_series() + .to_list() + ) + + with open(VALID_GENE_IDS_OUTFILE, "w") as f: + f.write("\n".join(valid_gene_ids)) + + nb_valid_genes = len(valid_gene_ids) + + logger.info( + f"Found {nb_valid_genes} valid gene IDs ({nb_valid_genes / nb_mapped_genes:.2%})" + ) + + +if __name__ == "__main__": + main() diff --git a/bin/download_eatlas_data.R b/bin/download_eatlas_data.R new file mode 100755 index 00000000..2684908a --- /dev/null +++ b/bin/download_eatlas_data.R @@ -0,0 +1,233 @@ +#!/usr/bin/env Rscript + +# Written by Olivier Coen. Released under the MIT license. + +options(error = traceback) +suppressPackageStartupMessages(library("ExpressionAtlas")) +library(ExpressionAtlas) +library(optparse) + +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + +get_args <- function() { + option_list <- list( + make_option("--accession", type = "character", help = "Accession number of expression atlas experiment. 
Example: E-MTAB-552") + ) + + args <- parse_args(OptionParser( + option_list = option_list, + description = "Get expression atlas data" + )) + return(args) +} + +download_expression_atlas_data_with_retries <- function(accession, max_retries = 3, wait_time = 5) { + success <- FALSE + attempts <- 0 + + while (!success && attempts < max_retries) { + attempts <- attempts + 1 + + tryCatch({ + atlas_data <- ExpressionAtlas::getAtlasData( accession ) + success <- TRUE + + }, warning = function(w) { + + # if the accession is not valid, we stop immediately (useless to keep going) + if (grepl("does not look like an ArrayExpress/BioStudies experiment accession.", w$message)) { + warning(w$message) + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + + # else, retrying + message("Attempt ", attempts, " Warning: ", w$message) + + if (attempts < max_retries) { + warning("Retrying in ", wait_time, " seconds...") + Sys.sleep(wait_time) + + } else { + + if (grepl("550 Requested action not taken; file unavailable", w$message)) { + warning(w$message) + write("EXPERIMENT SUMMARY NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } else if (grepl("Failure when receiving data from the peer", w$message)) { + warning(w$message) + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } else if (grepl("FTP status was", w$message)) { + warning(w$message) + write("FTP ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 101) + } else { + warning("Unhandled warning: ", w$message) + write("UNKNOWN ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + } + + }, error = function(e) { + + message("Attempt ", attempts, " Message: ", e$message) + + if (attempts < max_retries) { + warning("Retrying in ", wait_time, " seconds...") + Sys.sleep(wait_time) + + } else { + + if (grepl("Download appeared successful but no experiment summary object was found", e$message)) { 
+ warning(e$message) + write("EXPERIMENT SUMMARY NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } else { + warning("Unhandled error: ", e$message) + write("UNKNOWN ERROR", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + + } + }) + } + + return(atlas_data) +} + +get_rnaseq_data <- function(data) { + return(list( + count_data = assays( data )$counts, + platform = 'rnaseq', + count_type = 'raw', # rnaseq data are raw in ExpressionAtlas + sample_groups = colData(data)$AtlasAssayGroup + )) +} + +get_one_colour_microarray_data <- function(data) { + return(list( + count_data = exprs( data ), + platform = 'microarray', + count_type = 'normalised', # one colour microarray data are already normalised in ExpressionAtlas + sample_groups = phenoData(data)$AtlasAssayGroup + )) +} + +get_batch_id <- function(accession, data_type) { + batch_id <- paste0(accession, '_', data_type) + # cleaning + batch_id <- gsub("-", "_", batch_id) + return(batch_id) +} + +get_new_sample_names <- function(result, batch_id) { + new_colnames <- paste0(batch_id, '_', colnames(result$count_data)) + return(new_colnames) +} + +export_count_data <- function(result, batch_id) { + + # renaming columns, to make them specific to accession and data type + colnames(result$count_data) <- get_new_sample_names(result, batch_id) + + outfilename <- paste0(batch_id, '.', result$platform, '.', result$count_type, '.counts.csv') + + # exporting to CSV file + # index represents gene names + cat(paste('Exporting count data to file', outfilename)) + write.table(result$count_data, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE) +} + +export_metadata <- function(result, batch_id) { + + new_colnames <- get_new_sample_names(result, batch_id) + batch_list <- rep(batch_id, length(new_colnames)) + + df <- data.frame( + batch = batch_list, + condition = result$sample_groups, + sample = new_colnames + ) + + outfilename <- paste0(batch_id, '.design.csv') + 
cat(paste('Exporting design data to file', outfilename)) + write.table(df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE) +} + + +process_data <- function(atlas_data, accession) { + + eset <- atlas_data[[ accession ]] + + # looping through each data type (ex: 'rnaseq') in the experiment + for (data_type in names(eset)) { + + data <- eset[[ data_type ]] + + skip_iteration <- FALSE + # getting count dataframe + tryCatch({ + + if ( data_type == 'rnaseq' ) { + result <- get_rnaseq_data(data) + } else if ( startsWith(data_type, 'A-') ) { # typically: A-AFFY- or A-GEOD- + result <- get_one_colour_microarray_data(data) + } else { + warning(paste("Unknown data type:", data_type)) + write(paste("UNKNOWN DATA TYPE:", data_type), file = WARNING_REASON_FILE, append=TRUE) + skip_iteration <<- TRUE + } + + }, error = function(e) { + warning(paste("Caught an error: ", e$message)) + write(paste('ERROR: COULD NOT GET ASSAY DATA FOR EXPERIMENT ID', accession, 'AND DATA TYPE', data_type), file = WARNING_REASON_FILE, append=TRUE) + skip_iteration <<- TRUE + }) + + # If an error occurred, skip to the next iteration + if (skip_iteration) { + next + } + + batch_id <- get_batch_id(accession, data_type) + + # exporting count data to CSV + export_count_data(result, batch_id) + + # exporting metadata to CSV + export_metadata(result, batch_id) + } + +} + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + +args <- get_args() + +cat(paste("Getting data for accession", args$accession, "\n")) + +accession <- trimws(args$accession) +if (startsWith(accession, "E-PROT")) { + warning("Ignoring the ", accession, " experiment.") + write("PROTEOME ACCESSIONS NOT HANDLED", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) +} + +# searching and downloading expression atlas data +atlas_data 
<- download_expression_atlas_data_with_retries(args$accession) + +# writing count data in atlas_data to specific CSV files +process_data(atlas_data, args$accession) diff --git a/bin/download_geo_data.R b/bin/download_geo_data.R new file mode 100755 index 00000000..2686f13c --- /dev/null +++ b/bin/download_geo_data.R @@ -0,0 +1,827 @@ +#!/usr/bin/env Rscript + +# Written by Olivier Coen. Released under the MIT license. + +suppressPackageStartupMessages(library("GEOquery")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("tibble")) +suppressPackageStartupMessages(library("stringr")) +library(GEOquery) +library(optparse) +library(dplyr) +library(tibble) +library(stringr) + +options(error = traceback) + +COUNT_FILE_EXTENSION <- ".counts.csv" +DESIGN_FILE_EXTENSION <- ".design.csv" +MAPPING_FILE_EXTENSION <- ".sample_name_mapping.csv" +METADATA_FILE_EXTENSION <- ".platform_metadata.csv" +BASE_REJECTED_DIR <- "rejected" + +FAILURE_REASON_FILE <- "failure_reason.txt" +WARNING_REASON_FILE <- "warning_reason.txt" + +##################################################### +##################################################### +# ARG PARSER +##################################################### +##################################################### + +get_args <- function() { + option_list <- list( + make_option("--accession", type = "character", help = "Accession number of GEO dataset. 
Example: GSE56413"), + make_option("--species", type = "character", help = "Species name") + ) + + args <- parse_args(OptionParser( + option_list = option_list, + description = "Get GEO data" + )) + return(args) +} + + +##################################################### +##################################################### +# UTILS +##################################################### +##################################################### + +format_species_name <- function(x) { + x <- tools::toTitleCase(x) + x <- gsub("[_-]", " ", x) + return(x) +} + +write_warning <- function(msg) { + message(msg) + file_conn <- file( WARNING_REASON_FILE, open = "a") + cat(paste0(msg, "; "), file = file_conn, sep = "", fill = FALSE) + close(file_conn) +} + + +get_extensions <- function(file){ + extensions <- strsplit(basename(file), split="\\.")[[1]] + return(extensions) +} + + +get_rejected_dir <- function(platform, series) { + rejected_dir <- file.path(BASE_REJECTED_DIR, paste0(series$accession, '_', platform$id)) + dir.exists(rejected_dir) || dir.create(rejected_dir, recursive = TRUE) + return(rejected_dir) +} + + +clean_column_names <- function(df){ + + if (length(unique(colnames(df))) < length(colnames(df))){ + colnames(df) <- paste0(colnames(df), '_', seq_along(df)) + return(df) + } +} + + +##################################################### +##################################################### +# DOWNLOAD +##################################################### +##################################################### + +download_geo_data_with_retries <- function(accession, max_retries = 3, wait_time = 5) { + + success <- FALSE + attempts <- 0 + + while (!success && attempts < max_retries) { + attempts <- attempts + 1 + + tryCatch({ + geo_data <- GEOquery::getGEO( accession ) + success <- TRUE + + }, error = function(e) { + + message("Attempt ", attempts, " Message: ", e$message) + + if (attempts < max_retries) { + warning("Retrying in ", wait_time, " seconds...") + 
Sys.sleep(wait_time) + + } else { + warning("Unhandled error: ", e$message) + write("EXPERIMENT NOT FOUND", file = FAILURE_REASON_FILE) + quit(save = "no", status = 0) + } + }) + + } + return(geo_data) +} + +##################################################### +##################################################### +# PARSE SERIES / PLATFORM METADATA +##################################################### +##################################################### + +get_experiment_data <- function(geo_data) { + data <- geo_data[[1]] + experiment_data <- experimentData(data) + #print(experiment_data) + return(experiment_data) +} + + +get_experiment_type <- function(geo_data) { + experiment_data <- get_experiment_data(geo_data) + experiment_type <- tolower(attr(experiment_data, "other")$type) + if (experiment_type == "expression profiling by high throughput sequencing") { + return("rnaseq") + } else if (experiment_type == "expression profiling by array") { + return("microarray") + } else { + return(gsub("\n", " ; ", experiment_type)) + } +} + +get_series_species <- function(geo_data) { + message("Getting species included in series") + species_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + li <- unique(metadata$organism_ch1) + # check if organism_ch2 exists + if ("organism_ch2" %in% colnames(metadata)) { + li <- append(li, unique(metadata$organism_ch2)) + } + species_list[[i]] <- li + } + species_list <- unique(unlist(species_list)) + return(species_list) +} + + +get_series_supplementary_data <- function(geo_data, series) { + series_species <- get_series_species(geo_data) + if (length(series_species) > 1) { + message(paste("Multiple species found in series:", paste(series_species, collapse = ", "), ". 
Will not download supplementary data")) + return(list()) + } else if (length(series_species) == 0) { + message("No species found in series...") + return(list()) + } else { + if (series_species != series$species) { + message(paste("Species provided by the user:", series_species, "does not match species in GEO data:", series$species)) + return(list()) + } + experiment_data <- get_experiment_data(geo_data) + suppl_data_str <- attr(experiment_data, "other")$supplementary_file + return(stringr::str_split(suppl_data_str, "\n")[[1]]) + } +} + + +get_platform_id <- function(metadata) { + platform_id <- as.character(unique(metadata$platform_id))[1] + return(platform_id) +} + + +##################################################### +##################################################### +# RNASEQ SAMPLES +##################################################### +##################################################### + +get_rnaseq_samples <- function(geo_data, design_df) { + + rnaseq_sample_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + if (!("library_strategy" %in% colnames(metadata))) { + message("library_strategy column not found in metadata") + next + } + rnaseq_sample_df_list[[i]] <- metadata %>% + filter(library_strategy == "RNA-Seq" & geo_accession %in% design_df$sample) %>% + select(geo_accession) + } + # concatenate rows + rnaseq_sample_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + rnaseq_sample_df_list + ) + return(rnaseq_sample_df$geo_accession) +} + + +##################################################### +##################################################### +# SAMPLE NAME MAPPING +##################################################### +##################################################### + + +make_sample_name_mapping <- function(geo_data) { + message("Making sample name mapping") + mapping_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- 
pData(data) + mapping_df_list[[i]] <- metadata %>% + mutate( + sample_id = geo_accession, + sample_name = title + ) %>% + select(sample_id, sample_name) + } + # concatenate rows + mapping_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + mapping_df_list + ) + return(mapping_df) +} + +rename_columns <- function(df, mapping_df) { + id_map <- setNames(mapping_df$sample_id, mapping_df$sample_name) + names(df) <- ifelse( + names(df) %in% names(id_map), + id_map[names(df)], + names(df) + ) + return(df) +} + +##################################################### +##################################################### +# DESIGN +##################################################### +##################################################### + +get_samples_for_species <- function(metadata, species) { + # check if organism_ch2 exists + if ("organism_ch2" %in% colnames(metadata)) { + keep <- metadata$organism_ch1 == species & metadata$organism_ch2 == species + } else { + keep <- metadata$organism_ch1 == species + } + + # return a data.frame with matching samples + return(metadata$geo_accession[keep]) +} + + +get_columns_for_grouping <- function(df) { + + base_columns <- c("characteristics", "treatment_protocol", "label_protocol", "extract_protocol", "growth_protocol") + + columns_to_group <- c() + for (base_col in base_columns) { + ch1_col <- paste0(base_col, "_ch1") + ch2_col <- paste0(base_col, "_ch2") + + if (ch1_col %in% colnames(df)) { + columns_to_group <- c(columns_to_group, ch1_col) + } + if (ch2_col %in% colnames(df)) { + columns_to_group <- c(columns_to_group, ch2_col) + } + } + + return(columns_to_group) +} + + +build_design_dataframe <- function(df, accession) { + columns_to_group <- get_columns_for_grouping(df) + + design_df <- df %>% + mutate(sample = geo_accession) %>% # change column name geo_accession to sample + group_by(!!!syms(columns_to_group)) %>% # group by all columns for grouping found + mutate(group_num = cur_group_id()) %>% # create column 
made from group id + ungroup() %>% + mutate( + condition = paste0("G", group_num), # create condition column from group number + batch = accession + ) %>% + select(sample, condition, batch) %>% + arrange(condition) + + return(design_df) +} + + +get_design_for_platform <- function(design_df, metadata) { + platform_samples <- metadata$geo_accession + platform_design_df <- design_df %>% + filter(sample %in% platform_samples) + return(platform_design_df) +} + +get_design_for_rnaseq <- function(design_df, rnaseq_samples) { + rnaseq_design_df <- design_df %>% + filter(sample %in% rnaseq_samples) + return(rnaseq_design_df) +} + + +make_design <- function(metadata, series) { + design_df <- build_design_dataframe(metadata, series$accession) + # get samples corresponding to species + species_samples <- get_samples_for_species(metadata, series$species) + # filter design dataframe + design_df <- design_df %>% + filter(sample %in% species_samples) + return(design_df) +} + + +make_overall_design <- function(geo_data, series) { + message("Making overall design") + design_df_list <- list() + for (i in 1:length(geo_data)) { + data <- geo_data[[ i ]] + metadata <- pData(data) + #print(metadata) + # make design dataframe + # keep only samples corresponding to the species of interest + design_df <- make_design(metadata, series) + design_df_list[[i]] <- design_df + } + # full outer join + design_df <- Reduce( + function(df1, df2) dplyr::bind_rows(df1, df2), + design_df_list + ) + return(design_df) +} + + +##################################################### +##################################################### +# PARSE COUNTS FROM DATA +##################################################### +##################################################### + + +get_microarray_counts <- function(platform) { + # get count data corresponding to samples in the design + counts <- data.frame(exprs(platform$data)) %>% + select(all_of(platform$design$sample)) + # for now, only one element in the list + 
return(counts) +} + +parse_first_line <- function(filename, sep){ + tryCatch({ + counts <- read.table(filename, header = FALSE, sep = sep, row.names = 1, nrows = 1) + return(counts) + }, error = function(e) { + write_warning(paste("ERROR PARSING FIRST LINE IN", filename)) + return(NULL) + }) +} + +download_file <- function(data_url, filename){ + tryCatch({ + download.file(data_url, filename, method = "wget", quiet = TRUE) + return("SUCCESS") + }, error = function(e) { + write_warning(paste("ERROR WHILE DOWNLOADING:", filename)) + return("FAILURE") + }) +} + + +get_raw_counts_from_url <- function(data_url) { + + if ( tolower(data_url) == "none" || is.na(data_url) || data_url == "") { + write_warning(paste("MISFORMED URL:", data_url)) + return(NULL) + } + + filename <- tolower(basename(data_url)) + extensions <- get_extensions(filename) + ext <- extensions[length(extensions)] + if (ext == "gz") { + ext <- extensions[length(extensions) - 1] + } + if (!(ext %in% c("txt", "tsv", "csv", "tab"))) { + write_warning(paste("UNSUPPORTED EXTENSION:", ext, "for URL:", data_url)) + return(NULL) + } + + message(paste("Downloading", filename)) + download_status <- download_file(data_url, filename) + if (download_status == "FAILURE") { + return(NULL) + } + + separator <- NULL + for (sep in c("\t", ",", " ")) { + + # parsing the first line to determine the separator and see if there is a header + first_line <- parse_first_line(filename, sep) + if (is.null(first_line)) { + return(NULL) + } + + if (ncol(first_line) > 0) { + separator <- sep + if (is.numeric(first_line[1, 1])) { + has_header <- FALSE + } else { + has_header <- TRUE + } + break + } + } + + if (is.null(separator)) { + write_warning(paste("NO VALID SEPARATOR:", filename)) + return(NULL) + } + + message(paste("Parsing", filename)) + tryCatch({ + counts <- read.table(filename, header = has_header, sep = separator, row.names = 1) + }, error = function(e) { + write_warning(paste("ERROR WHILE PARSING", filename)) + 
return(NULL) + }) + + # removes rows that are all NA + counts <- counts[rowSums(!is.na(counts)) > 0, , drop = FALSE] + return(counts) +} + + +get_all_rnaseq_counts <- function(platform) { + pdata <- platform$metadata + # getting list of samples + samples <- pdata$geo_accession + # getting list of columns corresponding to supp data + # IMPORTANT: we assume here that data are of the same type (raw, TPM, FPKM, etc.) in each supplementary file column + supplementary_cols <- grep("^supplementary_file(_\\d)?$", names(pdata), value = TRUE) + + if (length(supplementary_cols) == 0) { + message("No supplementary files found") + return(data.frame()) + } else if (length(supplementary_cols) > 1) { + message("Multiple supplementary files found") + } + + suppl_df_cpt <- 1 + suppl_count_dfs <- list() + # building one count dataframe by type of suppl data + for (i in 1:length(supplementary_cols)) { + + count_df_list <- list() + cpt = 1 + for (j in 1:length(samples)) { + sample <- samples[[j]] + data_url <- pdata[pdata$geo_accession == sample, supplementary_cols[i]] + + counts <- get_raw_counts_from_url(data_url) + if (is.null(counts)) { + next + } + + if (ncol(counts) == 1) { + colnames(counts) <- c(sample) + } else { + # if multiple columns, we don't know how to deal with it + # nut it will be filtered out later at column match checking + message(paste("Multiple columns found for sample", sample)) + } + + # in case there is already a gene_id column, remove it + if ("gene_id" %in% names(counts)) { + counts <- counts[, -which(names(counts) == "gene_id")] + } + # setting the row names (gene ids) as a column + counts <- tibble::rownames_to_column(counts, var = "gene_id") + # adding to list + count_df_list[[cpt]] <- counts + cpt = cpt + 1 + } + + # checking if all files were skipped + if (length(count_df_list) == 0) { + message("No valid files found") + next + } + + # full outer join + joined_df <- Reduce( + function(df1, df2) merge(df1, df2, by = "gene_id", all = TRUE), + 
            count_df_list
        )
        # setting the column gene_id as row names
        joined_df <- tibble::column_to_rownames(joined_df, var = "gene_id")
        # cleaning column names in case of duplicates
        # it should happen only when there were multiple columns for the same sample
        joined_df <- clean_column_names(joined_df)

        suppl_count_dfs[[suppl_df_cpt]] <- joined_df
        suppl_df_cpt = suppl_df_cpt + 1
    }
    return(suppl_count_dfs)
}


#####################################################
#####################################################
# DATA QUALITY CONTROL
#####################################################
#####################################################

# Heuristic validity check for a microarray expression matrix: accepts
# only matrices that look normalized and log2-transformed (max <= 20);
# anything else is rejected with a warning.
# NOTE(review): the <= 20 / integer / > 1000 thresholds are heuristics —
# confirm against the GEO series actually processed.
is_valid_microarray <- function(counts, platform) {

    if (!all(colnames(counts) %in% platform$design$sample)) {
        message("Column names do not match samples in design")
        return(FALSE)
    }

    # flatten to a vector of non-NA expression values
    vals <- unlist(counts, use.names = FALSE)
    vals <- vals[!is.na(vals)]

    # TRUE when every value is (numerically) a whole number
    all_integers <- all(abs(vals - round(vals)) < 1e-8)
    value_range <- range(vals, na.rm = TRUE)

    if (value_range[2] <= 20) {
        message(paste(platform$id, ": normalized, log2 scale (e.g.
RMA, quantile)"))
        return(TRUE)
    } else if (all_integers) {
        write_warning(paste(platform$id, ": RAW PROBE INTENSITIES FOUND"))
        return(FALSE)
    } else if (value_range[2] > 1000) {
        write_warning(paste(platform$id, ": PARSED INTENSITIES: NORMALIZED BUT NOT LOG-TRANSFORMED"))
        return(FALSE)
    } else {
        write_warning(paste(platform$id, ": UNCLEAR DATA ORIGIN: CHECK GEO METADATA"))
        return(FALSE)
    }
}

# An RNA-seq count matrix is valid when all its columns correspond to
# samples of the platform design.
is_valid_rnaseq <- function(counts, platform) {

    if (!all(colnames(counts) %in% platform$design$sample)) {
        message(paste(platform$id, ": column names do not match samples in design"))
        return(FALSE)
    }

    return(TRUE)
}


# Guess whether an RNA-seq count matrix holds raw (integer) or
# normalised (decimal) counts.
# Returns one of "raw", "normalised" or "unknown".
check_rnaseq_normalisation_state <- function(counts, platform) {

    # checking if all values are integers
    tryCatch({
        is_all_integer <- function(x) all(floor(x) == x)
        # columns whose values are all whole numbers
        int_counts <- counts %>%
            select_if(is_all_integer)

        # BUG FIX: the original compared nrow(), which is unchanged by a
        # column selection (select_if keeps all rows), so the data were
        # always classified "raw"; the fraction of integer-only COLUMNS
        # is what is meant here.
        if (ncol(int_counts) < ncol(counts) * 0.01 ) {
            # (almost) every column contains decimals
            return("normalised")
        } else if (ncol(int_counts) == ncol(counts)) {
            # every column is integer-only
            return("raw")
        } else {
            return("unknown")
        }

    }, error = function(e) {
        # floor() fails on non-numeric columns
        write_warning(paste(platform$id, ": COULD NOT COMPUTE FLOOR"))
        return("unknown")
    })

}


#####################################################
#####################################################
# EXPORT
#####################################################
#####################################################

# Write the count matrix to CSV; rejected data go to the rejected dir.
export_count_data <- function(data, platform, series) {
    # renaming columns, to make them specific to accession and data type
    colnames(data$counts) <- paste0(series$accession, '_', colnames(data$counts))
    outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, COUNT_FILE_EXTENSION)
    if (!data$is_valid) {
        outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
    }

    # exporting to CSV file
    # index represents gene names
    message(paste(platform$id, ': exporting
count data to file', outfilename))
    write.table(data$counts, outfilename, sep = ',', row.names = TRUE, col.names = TRUE, quote = FALSE)
}


# Write the design table (sample / condition / batch) to CSV, with
# sample names prefixed by the series accession so they match the
# exported count columns.
export_design <- function(data, platform, series) {
    new_sample_names <- paste0(series$accession, '_', series$design$sample)
    design_df <- series$design %>%
        mutate(sample = new_sample_names ) %>%
        select(sample, condition, batch)

    outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type,'.', data$norm_state, DESIGN_FILE_EXTENSION)
    if (!data$is_valid) {
        outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
    }

    message(paste(platform$id, ': exporting design data to file', outfilename))
    write.table(design_df, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
}


# Write the sample-name <-> sample-ID mapping table to CSV.
export_name_mapping <- function(data, platform, series) {
    outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, MAPPING_FILE_EXTENSION)
    if (!data$is_valid) {
        outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
    }
    # BUG FIX: log message wrongly said "design data" (copy-paste error)
    message(paste(platform$id, ': exporting name mapping to file', outfilename))
    write.table(series$mapping, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
}

# Write the platform sample metadata to CSV.
# NOTE(review): platform$metadata is not set on the supplementary-data
# path built in main() — confirm this function is safe on that path.
export_metadata <- function(data, platform, series) {
    outfilename <- paste0(series$accession, '_', platform$id, '.', platform$type, '.', data$norm_state, METADATA_FILE_EXTENSION)
    if (!data$is_valid) {
        outfilename <- file.path(get_rejected_dir(platform, series), outfilename)
    }
    message(paste(platform$id, ': exporting metadata to file', outfilename))
    write.table(platform$metadata, outfilename, sep = ',', row.names = FALSE, col.names = TRUE, quote = FALSE)
}


#####################################################
#####################################################
# PROCESS DATA
#####################################################
#####################################################
# Validate, rename and export one count dataset (counts + design +
# name mapping + metadata).
post_process_and_export <- function(data, platform, series) {
    # keeping only non empty data
    if (nrow(data$counts) == 0 || ncol(data$counts) == 0) {
        message(paste(platform$id, ': no data found'))
        write_warning(paste(platform$id, ": NO DATA"))
        return(NULL)
    }
    # BUG FIX: the original called rename_columns(counts, ...) on an
    # undefined object `counts` and discarded the result; rename the
    # columns of data$counts instead so exports use sample IDs.
    data$counts <- rename_columns(data$counts, series$mapping)

    export_count_data(data, platform, series)
    export_design(data, platform, series)
    export_name_mapping(data, platform, series)
    export_metadata(data, platform, series)
}


# Process one platform of the series: build its design, parse its
# counts (microarray matrix or RNA-seq supplementary files), QC them
# and export the results.
process_platform_data <- function(platform, series) {

    platform$metadata <- pData(platform$data)
    platform$design <- get_design_for_platform(series$design, platform$metadata)
    valid_samples <- as.character(platform$design$sample)
    platform$id <- get_platform_id(platform$metadata)

    if (length(valid_samples) == 0) {
        message(paste(platform$id, ": no sample corresponding to species", series$species))
        return(NULL)
    }

    if (platform$type == "microarray") {

        counts <- get_microarray_counts(platform)
        data <- list( counts = counts )
        data$is_valid <- is_valid_microarray(counts, platform)
        # microarray matrices stored in GEO are treated as normalised
        data$norm_state <- "normalised"
        post_process_and_export(data, platform, series)

    } else {

        # one count matrix per supplementary-file column
        parsed_counts <- get_all_rnaseq_counts(platform)
        for (counts in parsed_counts) {
            data <- list(
                counts = counts,
                is_valid = is_valid_rnaseq(counts, platform),
                norm_state = check_rnaseq_normalisation_state(counts, platform)
            )
            post_process_and_export(data, platform, series)
        }

    }

}


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


main <- function() {

    args <- get_args()

    series <- list()

    series$accession <- args$accession
    series$species <- format_species_name(args$species)

    message(paste("Getting data for accession",
series$accession))
    # searching and downloading expression atlas data
    geo_data <- download_geo_data_with_retries(series$accession)

    # make a single design dataframe for all samples in the series
    series$design <- make_overall_design(geo_data, series)
    # BUG FIX: series$design is a data.frame, so length() returns its
    # COLUMN count (3 even when empty); test the row count instead, and
    # NULL, which Reduce() returns for an empty platform list.
    if ( is.null(series$design) || nrow(series$design) == 0 ) {
        # BUG FIX: message() concatenates its args without a separator
        message(paste("No sample corresponding to species", series$species))
        write(paste("NO SAMPLES FOR SPECIES", series$species), file = FAILURE_REASON_FILE)
        quit(save = "no", status = 0)
    }

    # make a map associating sample names to sample IDs
    series$mapping <- make_sample_name_mapping(geo_data)

    series$experiment_type <- get_experiment_type(geo_data)

    suppl_data_urls <- get_series_supplementary_data(geo_data, series)
    # for now, considering suppl data as raw rnaseq data
    # TODO: check if these are always raw rnaseq data
    if (length(suppl_data_urls) > 0) {

        message("Processing supplementary data")
        for (supp_data_url in suppl_data_urls) {
            counts <- get_raw_counts_from_url(supp_data_url)
            if (is.null(counts)) {
                next
            }
            # NOTE(review): this platform list carries no $metadata, yet
            # post_process_and_export() ends in export_metadata() — confirm
            # writing an empty metadata table is intended on this path.
            platform <- list(
                type = "rnaseq",
                id = "suppl",
                design = series$design
            )
            data <- list(
                counts = counts,
                is_valid = is_valid_rnaseq(counts, platform),
                norm_state = check_rnaseq_normalisation_state(counts, platform)
            )
            post_process_and_export(data, platform, series)
        }

    }

    # NOTE: we consider that a series is either a microarray series OR contains RNA-seq data
    # mixed types should be found only in SuperSeries, and it is not handled for now
    if ( series$experiment_type == "microarray" ) {

        message("Processing microarray data")
        for (i in 1:length(geo_data)) {
            platform <- list(
                type = "microarray",
                data = geo_data[[ i ]]
            )
            process_platform_data(platform, series)
        }

    } else {

        rnaseq_samples <- get_rnaseq_samples(geo_data, series$design)
        if ( series$experiment_type == "rnaseq" || length(rnaseq_samples) > 0 ) {

            message("Processing RNA-seq data")
            # taking a subset of the design
corresponding to bona-fide RNA-seq samples + rnaseq_design_df <- get_design_for_rnaseq(series$design, rnaseq_samples) + for (i in 1:length(geo_data)) { + platform <- list( + type = "rnaseq", + count_type = "raw", + data = geo_data[[ i ]] + ) + process_platform_data(platform, series) + } + + } else { + write_warning(paste("UNSUPPORTED PLATFORM:", series$experiment_type)) + } + } + + message("Done") +} + + +##################################################### +# ENTRYPOINT +##################################################### +main() diff --git a/bin/download_latest_ensembl_annotation.py b/bin/download_latest_ensembl_annotation.py new file mode 100755 index 00000000..5c7278f1 --- /dev/null +++ b/bin/download_latest_ensembl_annotation.py @@ -0,0 +1,402 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from datetime import datetime +from urllib.request import urlretrieve + +import httpx +import pandas as pd +from bs4 import BeautifulSoup +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +GENE_IDS_CHUNKSIZE = 50 # max allowed by Ensembl REST API + +ENSEMBL_REST_SERVER = "https://rest.ensembl.org/" +SPECIES_INFO_BASE_ENDPOINT = "info/genomes/taxonomy/{species}" +TAXONOMY_NAME_ENDPOINT = "taxonomy/name/{species}" +ENSEMBL_API_HEADERS = { + "Content-Type": "application/json", + "Accept": "application/json", +} +STOP_RETRY_AFTER_DELAY = 120 + +NCBI_TAXONOMY_API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/taxonomy" +NCBI_API_HEADERS = {"accept": "application/json", "content-type": "application/json"} + +ENSEMBL_DIVISION_TO_FOLDER = { + "EnsemblPlants": "plants", + "EnsemblVertebrates": "vertebrates", + "EnsemblMetazoa": "metazoa", + "EnsemblFungi": "fungi", + "EnsemblBacteria": "bacteria", + "EnsemblProtists": "protists", +} + 
+ENSEMBL_GENOMES_BASE_URL = "https://ftp.ebi.ac.uk/ensemblgenomes/pub/current/{}/gff3/" +ENSEMBL_VERTEBRATES_BASE_URL = "https://ftp.ensembl.org/pub/current/gff3/" + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get GEO Datasets accessions") + parser.add_argument( + "--species", + type=str, + dest="species", + required=True, + help="Species name", + ) + return parser.parse_args() + + +################################################################## +################################################################## +# httpx +################################################################## +################################################################## + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def parse_page_data(url: str) -> BeautifulSoup: + page = httpx.get(url) + page.raise_for_status() + return BeautifulSoup(page.content, "html.parser") + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_request_to_ncbi_taxonomy(taxid: str | int): + logger.info(f"Sending POST request to {NCBI_TAXONOMY_API_URL}") + taxons = [str(taxid)] + data = {"taxons": taxons} + response = httpx.post(NCBI_TAXONOMY_API_URL, headers=NCBI_API_HEADERS, json=data) + response.raise_for_status() + return response.json() + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_get_request_to_ensembl(url: str) -> list[dict]: 
+ logger.info(f"Sending GET request to {url}") + response = httpx.get(url, headers=ENSEMBL_API_HEADERS) + if response.status_code == 200: + response.raise_for_status() + else: + raise RuntimeError( + f"Failed to retrieve data: encountered error {response.status_code}" + ) + return response.json() + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def download_file(url: str, output_path: str): + try: + urlretrieve(url, output_path) + except Exception as e: + logger.error(f"Failed to download file from {url}: {e}") + raise + + +################################################################## +################################################################## +# PARSING +################################################################## +################################################################## + + +def get_species_taxid(species: str) -> int: + try: + return get_species_taxid_from_ensembl(species) + except Exception as e: + logger.error( + f"Could not get species taxid for species {species} using the Ensembl REST API: {e}.\nTrying NCBI taxonomy." + ) + ncbi_formated_species_name = format_species_name_for_ncbi_taxonomy(species) + return get_species_taxid_from_ncbi(ncbi_formated_species_name) + + +def get_species_taxid_from_ensembl(species: str) -> int: + url = ENSEMBL_REST_SERVER + TAXONOMY_NAME_ENDPOINT.format(species=species) + data = send_get_request_to_ensembl(url) + if len(data) == 0: + raise ValueError(f"No species found for species {species}") + elif len(data) > 1: + logger.warning( + f"Multiple species found for species {species}. Keeping the first one." + ) + species_data = data[0] + if "id" not in species_data: + raise ValueError( + f"Could not find taxid for species {species}. 
Data collected: {species_data}"
        )
    return species_data["id"]


def get_species_taxid_from_ncbi(species: str) -> int:
    """Look up the taxid of `species` via the NCBI taxonomy API."""
    result = send_request_to_ncbi_taxonomy(species)
    # refuse ambiguous answers rather than guessing
    if len(result["taxonomy_nodes"]) > 1:
        raise ValueError(f"Multiple taxids for species {species}")
    metadata = result["taxonomy_nodes"][0]
    if "taxonomy" not in metadata:
        raise ValueError(f"Could not find taxonomy results for species {species}")
    return int(metadata["taxonomy"]["tax_id"])


def get_species_division(species_taxid: int) -> str:
    """Return the Ensembl division (e.g. EnsemblPlants) for a taxid."""
    url = ENSEMBL_REST_SERVER + SPECIES_INFO_BASE_ENDPOINT.format(
        species=str(species_taxid)
    )
    data = send_get_request_to_ensembl(url)
    if len(data) == 0:
        raise ValueError(f"No division found for species Taxon ID {species_taxid}")
    elif len(data) > 1:
        logger.warning(
            f"Multiple divisions found for species Taxon ID {species_taxid}. Keeping the first one."
        )
    return data[0]["division"]


def get_species_category(species: str) -> str:
    """Map a species to its Ensembl FTP folder (plants, vertebrates, ...)."""
    species_taxid = get_species_taxid(species)
    logger.info(f"Got species taxid: {species_taxid}")
    division = get_species_division(species_taxid)
    logger.info(f"Got division: {division}")
    return ENSEMBL_DIVISION_TO_FOLDER[division]


def get_division_url(species: str) -> str:
    """Return the base gff3 FTP URL for the species' Ensembl division."""
    category = get_species_category(species)
    # vertebrates live on ftp.ensembl.org, all other divisions on ensemblgenomes
    if category == "vertebrates":
        return ENSEMBL_VERTEBRATES_BASE_URL
    else:
        return ENSEMBL_GENOMES_BASE_URL.format(category)


def format_species_name_for_ensembl(species: str) -> str:
    """Ensembl folder style: spaces to underscores, lower case."""
    return species.replace(" ", "_").lower()


def format_species_name_for_ncbi_taxonomy(species: str) -> str:
    """NCBI taxonomy style: underscores to spaces, lower case."""
    return species.replace("_", " ").lower()


def parse_last_modified_date(dt_string: str) -> datetime | None:
    """Parse an FTP-listing date ('%Y-%m-%d %H:%M'); None when unparseable."""
    try:
        return datetime.strptime(dt_string, "%Y-%m-%d %H:%M")
    except ValueError:
        return None


def get_candidate_species_folders(
    species: str, url: str, first_level: bool = True
) -> list[dict]:
    # Recursively scan an Ensembl FTP listing (and *_collection subfolders)
    # for folders whose name starts with `species`.
    soup = parse_page_data(url)
    species_url_records = []

    #
adding progress bar only at the first level + iterator = tqdm(soup.find_all("tr")) if first_level else soup.find_all("tr") + for item in iterator: + # all line sections + line_sections = list(item.find_all("td")) + # all folders of interest have an associated date + if len(line_sections) < 2: + continue + + folder_name_section = line_sections[1] + date_section = line_sections[2] + last_modified_date = parse_last_modified_date(date_section.text.strip()) + + for folder in folder_name_section.find_all("a"): + folder_url = f"{url}{folder.text}" + if folder.text.startswith(species): + d = { + "date": last_modified_date, + "url": folder_url, + "name": folder.text.rstrip("/"), + } + species_url_records.append(d) + print(folder.text) + elif folder.text.endswith("_collection/"): + species_url_records += get_candidate_species_folders( + species, folder_url, first_level=False + ) + else: + continue + + return species_url_records + + +def get_main_folder_url(records: list[dict], species: str) -> str | None: + main_folder_url = None + for record in records: + if record["name"] == species: + main_folder_url = record["url"] + break + return main_folder_url + + +def get_last_modified_folder_url(records: list[dict]) -> str: + df = pd.DataFrame.from_dict(records) + df.sort_values(by="date", ascending=False, inplace=True) + return df.iloc[0]["url"] + + +def get_current_annotation_folder(records: list[dict], species: str) -> str: + main_folder_url = get_main_folder_url(records, species) + if main_folder_url is not None: + return main_folder_url + + logger.info( + "Could not find a folder having the species as name. Checking for gca folders." + ) + gca_records = [ + record for record in records if record["name"].startswith(f"{species}_gca") + ] + if gca_records: + return get_last_modified_folder_url(gca_records) + + logger.info( + "Could not find a folder having the species as name. Getting the last modified one." 
+ ) + return get_last_modified_folder_url(records) + + +def parse_size(size_str): + """ + Convert size strings like '902K', '4.1M', '5G' to bytes. + + Parameters: + ----------- + size_str : str + Size string with suffix (K, M, G, T, etc.) + + Returns: + -------- + int : size in bytes + """ + size_str = size_str.strip().upper() + + # Define multipliers + multipliers = {"K": 1024, "M": 1024**2, "G": 1024**3, "T": 1024**4, "P": 1024**5} + + # Check if last character is a unit + if size_str[-1] in multipliers: + number = float(size_str[:-1]) + multiplier = multipliers[size_str[-1]] + return int(number * multiplier) + else: + # No suffix, assume it's already in bytes + return int(float(size_str)) + + +def get_annotation_file(url: str) -> str: + soup = parse_page_data(url) + file_records = [] + + for item in soup.find_all("tr"): + # all line sections + line_sections = list(item.find_all("td")) + if len(line_sections) < 4: + continue + + file = line_sections[1].text.strip() + if not file.endswith(".gff3.gz"): + continue + + d = { + "file": file, + "date": parse_last_modified_date(line_sections[2].text.strip()), + "size": parse_size(line_sections[3].text.strip()), + } + file_records.append(d) + + if not file_records: + raise ValueError("No annotation files found") + + df = pd.DataFrame.from_dict(file_records) + + # keeping the biggest annotation + max_size_df = df.loc[ + [df["size"].idxmax()] + ] # double brackets to keep it as a DataFrame + if len(max_size_df) == 1: + return max_size_df["file"].iloc[0] + + # if multiple files with the same size, return the most recent + most_recent_df = max_size_df.loc[ + [max_size_df["date"].idxmax()] + ] # double brackets to keep it as a DataFrame + if len(most_recent_df) == 1: + return max_size_df["file"].iloc[0] + + # if still multiple files, return the first one + # remove the one ending with 'chr.gff3.gz' if it exists + if max_size_df["file"].str.endswith("chr.gff3.gz").any(): + max_size_df = 
max_size_df[~max_size_df["file"].str.endswith("chr.gff3.gz")] + return max_size_df["file"].iloc[0] + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + species = format_species_name_for_ensembl(args.species) + division_url = get_division_url(species) + logger.info(f"Searching for the right folder in {division_url}") + + species_url_records = get_candidate_species_folders(species, division_url) + if not species_url_records: + raise ValueError(f"No species folder found for {species}") + + annotation_folder_url = get_current_annotation_folder(species_url_records, species) + logger.info(f"Found current annotation folder: {annotation_folder_url}") + + annotation_file = get_annotation_file(annotation_folder_url) + + annotation_full_url = annotation_folder_url + annotation_file + logger.info(f"Found annotation URL: {annotation_full_url}.\nDownloading...") + + download_file(annotation_full_url, annotation_file) + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/bin/download_latest_ncbi_annotation.py b/bin/download_latest_ncbi_annotation.py new file mode 100755 index 00000000..384906dc --- /dev/null +++ b/bin/download_latest_ncbi_annotation.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import shutil +import sys +import zipfile +from pathlib import Path + +import httpx +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig( + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO +) +logger = logging.getLogger(__name__) + +# Modern NCBI API +NCBI_DATASET_API_URL = "https://api.ncbi.nlm.nih.gov/datasets/v2/" + +NCBI_TAXONOMY_ENDPOINT = "taxonomy" +NCBI_GENOME_DATASET_REPORT_BASE_ENDPOINT = "genome/taxon/{taxid}/dataset_report" +NCBI_DOWNLOAD_ENDPOINT = "genome/download" + + +NCBI_GENOME_DATASET_REPORT_API_PARAMS = { + "filters.has_annotation": True, + "page_size": 1000, +} +NCBI_API_HEADERS = {"accept": "application/json", "content-type": "application/json"} + +DOWNLOADED_FILENAME = "ncbi_dataset.zip" +ACCESSION_FILE = "accession.txt" + + +##################################################### +##################################################### +# PARSER +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Get best assembly for a specific taxon ID" + ) + parser.add_argument("--species", type=str, required=True, help="Species name") + return parser.parse_args() + + +##################################################### +##################################################### +# httpx +##################################################### +##################################################### + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_post_request_to_ncbi_dataset(endpoint: str, data: dict, params: dict = {}): + url = NCBI_DATASET_API_URL + endpoint + response = httpx.post(url, headers=NCBI_API_HEADERS, json=data, params=params) + response.raise_for_status() + return 
response.json() + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def send_get_request_to_ncbi_dataset(endpoint: str, params: dict = {}): + url = NCBI_DATASET_API_URL + endpoint + response = httpx.get(url, headers=NCBI_API_HEADERS, params=params) + response.raise_for_status() + return response.json() + + +##################################################### +##################################################### +# DATA HANDLING +##################################################### +##################################################### + + +def get_species_taxid(species: str) -> int: + data = {"taxons": [species]} + result = send_post_request_to_ncbi_dataset(NCBI_TAXONOMY_ENDPOINT, data) + + if len(result["taxonomy_nodes"]) > 1: + raise ValueError(f"Multiple taxids for species {species}") + metadata = result["taxonomy_nodes"][0] + + if "taxonomy" not in metadata: + logger.info(f"Could not find taxonomy results for species {species}") + if "errors" in metadata: + for error in metadata["errors"]: + logger.error(f"Error: {error['reason']}\n") + sys.exit(100) + return int(metadata["taxonomy"]["tax_id"]) + + +def get_assembly_reports(taxid: int): + result = send_get_request_to_ncbi_dataset( + endpoint=NCBI_GENOME_DATASET_REPORT_BASE_ENDPOINT.format(taxid=taxid), + params=NCBI_GENOME_DATASET_REPORT_API_PARAMS, + ) + return result.get("reports", []) + + +def get_assembly_with_best_stats(reports: list[dict]): + sorted_reports = sorted( + reports, + key=lambda x: ( + int(x.get("assembly_stats").get("total_sequence_length", 0)), + -int(x.get("assembly_stats", {}).get("total_number_of_chromosomes", 1e9)), + ), + reverse=True, + ) + return sorted_reports[0] + + +def get_current_assemblies(reports: list[dict]) -> dict | None: + current_assembly_reports = [ + report + for report in reports + if report.get("assembly_info", {}).get("refseq_category") == "reference genome" 
+ ] + if not current_assembly_reports: + return None + + refseq_reports = [ + report + for report in current_assembly_reports + if report.get("source_database") == "SOURCE_DATABASE_REFSEQ" + ] + + if refseq_reports: + return refseq_reports[0] + else: + return None + + +def get_reference_assembly(reports: list[dict]) -> dict: + best_assembly_report = get_current_assemblies(reports) + if best_assembly_report is not None: + return best_assembly_report + else: + return get_assembly_with_best_stats(reports) + + +def format_species_name(species: str): + return species.replace("_", " ").lower() + + +def download_genome_annotation(genome_accession: str) -> str: + data = {"accessions": [genome_accession], "include_annotation_type": ["GENOME_GFF"]} + params = {"filename": DOWNLOADED_FILENAME} + send_post_request_to_ncbi_dataset(NCBI_TAXONOMY_ENDPOINT, data, params) + if not Path(DOWNLOADED_FILENAME).exists(): + raise FileNotFoundError( + f"Downloaded file not found for accession {genome_accession}" + ) + + +def extract_annotation_file_from_archive(): + with zipfile.ZipFile(DOWNLOADED_FILENAME, "r") as zip_ref: + zip_ref.extractall() + + valid_files = list(Path().cwd().glob(f"ncbi_dataset/data/{accession}/*.gff")) + + if not valid_files: + raise ValueError(f"No annotation file found for accession {accession}") + + if len(valid_files) > 1: + logger.warning( + f"Multiple annotation files found for accession {accession}. 
Taking the first one" + ) + + annotation_file = valid_files[0] + shutil.move(annotation_file, f"{accession}.gff") + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + +if __name__ == "__main__": + args = parse_args() + + species = format_species_name(args.species) + + species_taxid = get_species_taxid(species) + logger.info(f"Species taxid: {species_taxid}") + + logger.info(f"Getting best NCBI assembly for taxid: {species_taxid}") + reports = get_assembly_reports(species_taxid) + + if not reports: + logger.error(f"No assembly reports found for taxid {species_taxid}") + sys.exit(100) + + # looping while we can get an annotation file + annotation_found = False + while not annotation_found and reports: + best_assembly_report = get_reference_assembly(reports) + logger.info( + f"Best assembly: {best_assembly_report['accession']}. Trying to download annotation" + ) + accession = best_assembly_report["accession"] + try: + download_genome_annotation(accession) + extract_annotation_file_from_archive() + annotation_found = True + except Exception as e: + logger.error(f"Error downloading annotation for accession {accession}: {e}") + + if not annotation_found: + # Remove the best assembly report from the list of reports + reports = [report for report in reports if report["accession"] != accession] + + if not annotation_found: + logger.error(f"No annotation found for taxid {species_taxid}") + sys.exit(100) + + logger.info("Done") diff --git a/bin/extract_gene_ids.py b/bin/extract_gene_ids.py new file mode 100755 index 00000000..f09f01c8 --- /dev/null +++ b/bin/extract_gene_ids.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
import argparse
import logging
from pathlib import Path

import config
import polars as pl
from common import parse_count_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# suffix appended to the count-file stem for the gene-ID output
CLEANED_GENE_IDS_SUFFIX = ".gene_ids.txt"


def parse_args():
    """Parse command-line arguments."""
    # fix: the first positional argument of ArgumentParser is ``prog``, not the
    # description, and the original text was copy-pasted from the renaming
    # script; use a proper description matching what this script does.
    parser = argparse.ArgumentParser(
        description="Extract sorted gene IDs from a count table"
    )
    parser.add_argument(
        "--count-file", type=Path, required=True, help="Input file containing counts"
    )
    return parser.parse_args()


def get_sorted_gene_ids(df: pl.DataFrame) -> list[str]:
    """Return the gene ID column as a sorted Python list."""
    return (
        df.select(config.GENE_ID_COLNAME)
        .sort(config.GENE_ID_COLNAME)
        .to_series()
        .to_list()
    )


##################################################################
# MAIN
##################################################################


def main():
    """Write the sorted gene IDs of a count table to <stem>.gene_ids.txt."""
    args = parse_args()

    logger.info(f"Converting IDs for count file {args.count_file.name}...")

    df = parse_count_table(args.count_file)

    logger.info("Writing cleaned IDs")
    gene_ids_outfile = args.count_file.with_name(
        args.count_file.stem + CLEANED_GENE_IDS_SUFFIX
    )
    gene_ids = get_sorted_gene_ids(df)

    with open(gene_ids_outfile, "w") as fout:
        fout.write("\n".join(gene_ids))


if __name__ == "__main__":
    main()


# ---- diff: new file bin/filter_and_rename_genes.py ----
#!/usr/bin/env python3
# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
import sys
from pathlib import Path

import config
import polars as pl
from common import parse_count_table, parse_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


##################################################################
# CONSTANTS
##################################################################

RENAMED_FILE_SUFFIX = ".renamed.parquet"

WARNING_REASON_FILE = "warning_reason.txt"
FAILURE_REASON_FILE = "failure_reason.txt"

# counter files read by the pipeline wrapper (fixed file-name contract)
UNMAPPED_FILE_SUFFIX = "unmapped.txt"
NOT_VALID_FILE_SUFFIX = "not_valid.txt"
MERGED_FILE_SUFFIX = "merged.txt"
FINAL_FILE_SUFFIX = "final.txt"

##################################################################
# FUNCTIONS
##################################################################


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser("Rename gene IDs using mapped IDs")
    parser.add_argument(
        "--count-file", type=Path, required=True, help="Input file containing counts"
    )
    parser.add_argument(
        "--mappings",
        type=Path,
        dest="mapping_file",
        help="Mapping file containing gene IDs",
    )
    parser.add_argument(
        "--valid-gene-ids",
        type=Path,
        dest="valid_gene_ids_file",
        help="File containing valid gene IDs",
    )
    return parser.parse_args()


def _write_counter(filename: str, value: int) -> None:
    """Write a single integer counter to a pipeline output file."""
    with open(filename, "w") as f:
        f.write(str(value))


def _example_unmapped_genes(
    rejected_df: pl.DataFrame, nb_unmapped_genes: int
) -> list:
    """Return up to five example gene IDs that could not be mapped."""
    sample_size = min(5, nb_unmapped_genes)
    return rejected_df[config.GENE_ID_COLNAME].head(sample_size).to_list()


##################################################################
# MAIN
##################################################################


def main():
    """Map gene IDs, keep valid ones, collapse duplicates, export counts."""
    args = parse_args()

    logger.info(f"Converting IDs for count file {args.count_file.name}...")

    # ------------------ parsing files ------------------

    df = parse_count_table(args.count_file)

    if df.is_empty():
        msg = "COUNT FILE IS EMPTY"
        logger.warning(msg)
        with open(FAILURE_REASON_FILE, "w") as f:
            f.write(msg)
        sys.exit(0)

    # ------------------ getting mappings ------------------

    mapping_df = parse_table(args.mapping_file)
    mapping_dict = dict(
        zip(
            mapping_df[config.ORIGINAL_GENE_ID_COLNAME],
            mapping_df[config.GENE_ID_COLNAME],
        )
    )

    # ------------------ mapping gene IDs ------------------

    # IMPORTANT: only genes whose ID can be mapped are kept
    original_nb_genes = len(df)

    rejected_df = df.filter(~pl.col(config.GENE_ID_COLNAME).is_in(mapping_dict.keys()))
    nb_unmapped_genes = len(rejected_df)

    df = df.filter(pl.col(config.GENE_ID_COLNAME).is_in(mapping_dict.keys()))
    nb_mapped_genes = len(df)

    _write_counter(UNMAPPED_FILE_SUFFIX, nb_unmapped_genes)

    if df.is_empty():
        example_rejected_genes = _example_unmapped_genes(rejected_df, nb_unmapped_genes)
        msg = f"NO GENES WERE MAPPED. EXAMPLE OF GENE IDS: {example_rejected_genes}"
        logger.error(msg)
        with open(FAILURE_REASON_FILE, "w") as f:
            f.write(msg)

        # zero out the remaining counters so downstream reporting stays consistent
        for counter_file in (
            NOT_VALID_FILE_SUFFIX,
            MERGED_FILE_SUFFIX,
            FINAL_FILE_SUFFIX,
        ):
            _write_counter(counter_file, 0)

        sys.exit(0)

    if len(df) < original_nb_genes:
        example_rejected_genes = _example_unmapped_genes(rejected_df, nb_unmapped_genes)
        msg = (
            f"{nb_mapped_genes / original_nb_genes:.2%} of genes were mapped ({nb_mapped_genes} out of {original_nb_genes}). "
            + f"Example of unmapped genes: {example_rejected_genes}"
        )
        logger.warning(msg)
        # append: another step may already have written a warning reason
        with open(WARNING_REASON_FILE, "a") as f:
            f.write(msg)
    else:
        logger.info(
            f"All genes were mapped ({nb_mapped_genes} out of {original_nb_genes})"
        )

    logger.info("Renaming gene names")
    # rename gene names to mapped IDs using the mapping dict
    df = df.with_columns(
        pl.col(config.GENE_ID_COLNAME)
        .replace(mapping_dict)
        .alias(config.GENE_ID_COLNAME)
    )

    # ------------------ keeping valid gene IDs ------------------

    logger.info("Keeping only genes with sufficient occurrence over datasets")
    nb_genes_before_validation = len(df)

    with open(args.valid_gene_ids_file, "r") as fin:
        valid_gene_ids = [line.strip() for line in fin.readlines()]

    df = df.filter(pl.col(config.GENE_ID_COLNAME).is_in(valid_gene_ids))

    nb_not_valid_genes = nb_genes_before_validation - len(df)
    logger.info(
        f"{nb_not_valid_genes} ({nb_not_valid_genes / nb_genes_before_validation:.2%}) genes were not valid"
    )

    _write_counter(NOT_VALID_FILE_SUFFIX, nb_not_valid_genes)

    if df.is_empty():
        msg = "NO GENES LEFT AFTER REMOVING RARE GENE IDS"
        logger.error(msg)
        with open(FAILURE_REASON_FILE, "w") as f:
            f.write(msg)

        _write_counter(MERGED_FILE_SUFFIX, 0)
        _write_counter(FINAL_FILE_SUFFIX, 0)

        sys.exit(0)

    # ------------------ duplicate gene IDs ------------------

    # several source gene names can map to the same gene ID; downstream steps
    # require integer counts, so duplicates are collapsed with max() rather
    # than mean() (a mean would turn integer counts into floats)
    # TODO: check if there is another way to avoid duplicate gene names
    logger.info("Computing max counts for genes with duplicate IDs")
    df = df.group_by(config.GENE_ID_COLNAME, maintain_order=True).agg(
        pl.exclude(config.GENE_ID_COLNAME).max()
    )

    # ------------------ writing outfiles ------------------

    _write_counter(MERGED_FILE_SUFFIX, nb_mapped_genes - len(df))
    _write_counter(FINAL_FILE_SUFFIX, len(df))

    logger.info("Writing output file")
    outfilename = args.count_file.with_suffix(RENAMED_FILE_SUFFIX).name
    df.write_parquet(outfilename)


if __name__ == "__main__":
    main()


# ---- diff: new file bin/filter_out_samples_with_too_many_missing_values.py ----
#!/usr/bin/env python3
# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from pathlib import Path

import config
import polars as pl
from common import export_parquet, parse_count_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

OUTFILE_SUFFIX = ".nulls_filtered.parquet"
RATIO_NULL_VALUES_PER_SAMPLE_OUTFILE = "ratio_null_values_per_sample.csv"
RATIO_NULL_VALUES_OUTFILE = "ratio_null_values.csv"
NB_REJECTED_SAMPLES_OUTFILE = "nb_rejected_samples.csv"
NB_KEPT_SAMPLES_OUTFILE = "nb_kept_samples.csv"


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Filter out samples not valid")
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    parser.add_argument(
        "--max-null-ratio",
        type=float,
        dest="max_null_ratio",
        required=True,
        help="Maximum ratio of null values",
    )
    parser.add_argument(
        "--valid-gene-ids",
        type=Path,
        dest="valid_gene_ids",
        required=True,
        help="Valid gene IDs",
    )
    return parser.parse_args()


def get_nb_valid_genes(valid_gene_ids_file: Path) -> int:
    """Count the gene IDs (one per line) listed in the valid-gene-IDs file."""
    with open(valid_gene_ids_file, "r") as fin:
        return len(fin.readlines())


def get_nb_internal_nulls(df: pl.DataFrame) -> pl.DataFrame:
    """Count null values per sample column.

    Returns a dataframe with two columns: the sample name and the number of
    null values found in that sample.
    """
    return df.select(pl.exclude(config.GENE_ID_COLNAME).is_null().sum()).transpose(
        include_header=True,
        header_name=config.SAMPLE_COLNAME,
        column_names=[config.GENE_COUNT_COLNAME],
    )


def get_ratio_null_values(
    df: pl.DataFrame, nb_missing_genes: int, nb_valid_genes: int
) -> pl.DataFrame:
    """Turn per-sample null counts into ratios over the full valid-gene set.

    Valid genes absent from the table altogether count as nulls for every
    sample, hence the ``nb_missing_genes`` offset.
    """
    return df.select(
        pl.col(config.SAMPLE_COLNAME),
        (
            (pl.col(config.GENE_COUNT_COLNAME) + pl.lit(nb_missing_genes))
            / nb_valid_genes
        ).alias(config.RATIO_COLNAME),
    )


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    """Drop samples whose null-value ratio exceeds the allowed maximum."""
    args = parse_args()

    logger.info("Loading count data...")
    count_df = parse_count_table(args.count_file)
    nb_genes = len(count_df)
    # first column holds gene IDs, the rest are samples
    nb_samples = count_df.shape[1] - 1
    logger.info(f"Loaded count data with {nb_genes} genes and {nb_samples} samples")

    logger.info("Computing total number of nulls per sample")

    # nulls present inside the dataframe itself (rare but possible)
    nb_null_values_df = get_nb_internal_nulls(count_df)

    # valid genes entirely missing from the dataframe
    nb_valid_genes = get_nb_valid_genes(args.valid_gene_ids)
    nb_missing_genes = nb_valid_genes - nb_genes

    # add the number of missing genes to the null count of each sample
    ratio_values_df = get_ratio_null_values(
        nb_null_values_df, nb_missing_genes, nb_valid_genes
    )

    valid_samples = (
        ratio_values_df.filter(pl.col(config.RATIO_COLNAME) <= args.max_null_ratio)
        .select(pl.col(config.SAMPLE_COLNAME))
        .to_series()
        .to_list()
    )

    # if at least one valid sample remains, build an updated count dataframe
    if valid_samples:
        # fix: use nb_samples (gene-ID column excluded); count_df.shape[1]
        # over-counted the filtered columns by one
        logger.info(f"Filtered out {nb_samples - len(valid_samples)} columns")
        valid_count_df = count_df.select([config.GENE_ID_COLNAME] + valid_samples)
        export_parquet(valid_count_df, args.count_file, OUTFILE_SUFFIX)
    else:
        logger.error("No valid columns remaining")

    # collect all ratio values for export
    ratio_values = ratio_values_df.select(config.RATIO_COLNAME).to_series().to_list()
    with open(RATIO_NULL_VALUES_OUTFILE, "w") as outfile:
        # sorted so the output is deterministic
        outfile.write(",".join([str(val) for val in sorted(ratio_values)]))

    ratio_values_df.write_csv(RATIO_NULL_VALUES_PER_SAMPLE_OUTFILE)

    with open(NB_KEPT_SAMPLES_OUTFILE, "w") as fout:
        fout.write(str(len(valid_samples)))

    with open(NB_REJECTED_SAMPLES_OUTFILE, "w") as fout:
        fout.write(str(nb_samples - len(valid_samples)))

    logger.info("Done")


if __name__ == "__main__":
    main()


# ---- diff: new file bin/filter_out_samples_with_too_many_zeros.py ----
#!/usr/bin/env python3
# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from pathlib import Path

import config
import polars as pl
from common import export_parquet, parse_count_table

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

OUTFILE_SUFFIX = ".zeros_filtered.parquet"
RATIO_ZEROS_PER_SAMPLE_OUTFILE = "ratio_zeros_per_sample.csv"
RATIO_ZERO_VALUES_OUTFILE = "ratio_zeros.csv"
NB_REJECTED_SAMPLES_OUTFILE = "nb_rejected_samples.csv"
NB_KEPT_SAMPLES_OUTFILE = "nb_kept_samples.csv"


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Filter out samples not valid")
    parser.add_argument(
        "--counts", type=Path, dest="count_file", required=True, help="Count file"
    )
    parser.add_argument(
        "--max-zero-ratio",
        type=float,
        dest="max_zero_ratio",
        required=True,
        help="Maximum ratio of zeros allowed",
    )
    return parser.parse_args()


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    """Drop samples whose zero-count ratio exceeds the allowed maximum."""
    args = parse_args()

    logger.info("Loading count data...")
    count_df = parse_count_table(args.count_file)
    # first column holds gene IDs, the rest are samples
    nb_samples = count_df.shape[1] - 1
    logger.info(
        f"Loaded count data with {len(count_df)} genes and {nb_samples} samples"
    )

    # fraction of zero values in each sample column
    ratio_zeros_df = count_df.select(
        pl.exclude(config.GENE_ID_COLNAME).eq(pl.lit(0)).mean()
    )

    # samples whose zero ratio does not exceed the allowed maximum
    valid_samples = [
        col
        for col in ratio_zeros_df.columns
        if ratio_zeros_df[col][0] <= args.max_zero_ratio
    ]

    # if at least one valid sample remains, build an updated count dataframe
    if valid_samples:
        # fix: use nb_samples (gene-ID column excluded); count_df.shape[1]
        # over-counted the filtered columns by one
        logger.info(f"Filtered out {nb_samples - len(valid_samples)} columns")
        valid_count_df = count_df.select(
            pl.col(config.GENE_ID_COLNAME), pl.col(valid_samples)
        )
        export_parquet(valid_count_df, args.count_file, OUTFILE_SUFFIX)
    else:
        logger.error("No valid columns remaining")

    # collect all ratio values for export
    ratio_values = list(ratio_zeros_df.row(0))
    with open(RATIO_ZERO_VALUES_OUTFILE, "w") as outfile:
        # sorted so the output is deterministic
        outfile.write(",".join([str(val) for val in sorted(ratio_values)]))

    ratio_zeros_df.write_csv(RATIO_ZEROS_PER_SAMPLE_OUTFILE)

    with open(NB_KEPT_SAMPLES_OUTFILE, "w") as fout:
        fout.write(str(len(valid_samples)))

    with open(NB_REJECTED_SAMPLES_OUTFILE, "w") as fout:
        fout.write(str(nb_samples - len(valid_samples)))

    logger.info("Done")


if __name__ == "__main__":
    main()


# ---- diff: new file bin/get_candidate_genes.py ----
#!/usr/bin/env python3
# Written by Olivier Coen. Released under the MIT license.
import argparse
import logging
from pathlib import Path

import config
import polars as pl

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# outfile name templates, formatted with the section number
CANDIDATE_COUNTS_OUTFILENAME = "section_{}.candidate_counts.parquet"
STATS_WITH_SECTION_OUTFILENAME = "section_{}.stats.parquet"


#####################################################
#####################################################
# FUNCTIONS
#####################################################
#####################################################


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Get statistics from count data for each gene"
    )
    parser.add_argument(
        "--counts",
        type=Path,
        dest="count_file",
        required=True,
        help="File containing counts for all genes",
    )
    parser.add_argument(
        "--stats",
        type=Path,
        dest="stat_file",
        required=True,
        help="File containing statistics of expression over all datasets",
    )
    parser.add_argument(
        "--nb-candidates-per-section",
        type=int,
        dest="nb_candidates_per_section",
        required=True,
        help="Number of candidates per section to select for subsequent steps",
    )
    parser.add_argument(
        "--nb-sections",
        type=int,
        dest="nb_sections",
        required=True,
        help="Number of sections to divide the data into",
    )
    return parser.parse_args()


def parse_stats(file: Path) -> pl.DataFrame:
    """Read the statistics CSV, forcing gene IDs to strings and stats to floats."""
    return pl.read_csv(file).select(
        pl.col(config.GENE_ID_COLNAME).cast(pl.String()),
        pl.exclude(config.GENE_ID_COLNAME).cast(pl.Float64()),
    )


def add_sections(stat_df: pl.DataFrame, nb_sections: int):
    """Assign each gene to a section based on its mean expression level.

    Polars only ranks non-null values and preserves the null ones.
    """
    return stat_df.with_columns(
        (
            pl.col(config.MEAN_COLNAME).rank(method="ordinal", descending=True)
            / pl.col(config.MEAN_COLNAME).count()
            * nb_sections
            + pl.lit(1)
        )
        .floor()
        # NOTE(review): Int8 caps nb_sections at 127 -- confirm acceptable
        .cast(pl.Int8)
        # only the last-ranked gene lands exactly on nb_sections + 1:
        # fold it back into the last section
        .replace({nb_sections + 1: nb_sections})
        .alias("section")
    )


def get_best_candidates(
    stat_df: pl.DataFrame, nb_candidates_per_section: int
) -> pl.DataFrame:
    """Keep, per section, the genes with the lowest coefficient of variation."""
    return (
        stat_df.sort(
            config.COEFFICIENT_OF_VARIATION_COLNAME,
            descending=False,
            nulls_last=True,
            maintain_order=True,
        )
        .group_by("section", maintain_order=True)
        .agg(pl.col(config.GENE_ID_COLNAME).head(nb_candidates_per_section))
    )


def get_counts_for_candidates(file: Path, best_candidates: list[str]) -> pl.DataFrame:
    """Load the count table and keep only the candidate genes."""
    return pl.read_parquet(file).filter(
        pl.col(config.GENE_ID_COLNAME).is_in(best_candidates)
    )


#####################################################
#####################################################
# MAIN
#####################################################
#####################################################


def main():
    """Split genes into expression sections and export per-section candidates."""
    args = parse_args()

    stat_df = parse_stats(args.stat_file)

    logger.info("Getting sections")
    stat_df = add_sections(stat_df, args.nb_sections)

    logger.info("Getting best candidates")
    # candidate genes are chosen on the statistical descriptor (cv, rcvm)
    best_candidates_df = get_best_candidates(
        stat_df,
        args.nb_candidates_per_section,
    )

    logger.info("Getting counts of best candidates")
    # coded as a plain loop to keep it simple, since it does not impact
    # speed and scalability much
    for row in best_candidates_df.iter_rows():
        section = row[0]
        best_candidates = row[1]
        candidate_gene_count_lf = get_counts_for_candidates(
            args.count_file, best_candidates
        )
        # count data for the best candidates of this section
        candidate_gene_count_lf.write_parquet(
            CANDIDATE_COUNTS_OUTFILENAME.format(section)
        )
        # statistics for all genes of this section
        stat_df.filter(pl.col("section") == section).write_parquet(
            STATS_WITH_SECTION_OUTFILENAME.format(section)
        )


if __name__ == "__main__":
    main()


# ---- diff: new file bin/get_eatlas_accessions.py ----
#!/usr/bin/env python3
# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
import random
from functools import partial
from multiprocessing import Pool

import httpx
import pandas as pd
import yaml
from natural_language_utils import keywords_in_fields
from tenacity import (
    before_sleep_log,
    retry,
    stop_after_delay,
    wait_exponential,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

ALLOWED_PLATFORMS = ["rnaseq", "microarray"]
# accessions that should not be fetched automatically:
# - E-GTEX-8 contains 17350 samples (way too big)
EXCLUDED_ACCESSION_PATTERNS = ["E-GTEX-"]

ALL_EXP_URL = "https://www.ebi.ac.uk/gxa/json/experiments/"
ACCESSION_OUTFILE_NAME = "accessions.txt"
# ALL_EXPERIMENTS_METADATA_OUTFILE_NAME = "all_experiments.metadata.tsv"
SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME = "species_experiments.metadata.tsv"
SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME = "selected_experiments.metadata.tsv"
FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME = "filtered_experiments.keywords.yaml"

SAMPLING_QUOTA_OUTFILE = "sampling_quota.txt"


##################################################################
##################################################################
# FUNCTIONS
##################################################################


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser("Get expression atlas accessions")
    parser.add_argument(
        "--species",
        type=str,
        required=True,
        help="Search Expression Atlas for this specific species",
    )
    parser.add_argument(
        "--keywords",
        type=str,
        nargs="*",
        help="Keywords to search for in experiment description",
    )
    parser.add_argument(
        "--platform", type=str, help="Platform type", choices=ALLOWED_PLATFORMS
    )
    parser.add_argument(
        "--random-sampling-size",
        dest="random_sampling_size",
        type=int,
        help="Random sampling size",
    )
    parser.add_argument(
        "--random-sampling-seed",
        dest="random_sampling_seed",
        type=int,
        help="Random sampling seed",
    )
    parser.add_argument(
        "--cpus", type=int, dest="nb_cpus", required=True, help="Number of CPUs"
    )
    return parser.parse_args()


@retry(
    stop=stop_after_delay(600),
    wait=wait_exponential(multiplier=1, min=1, max=30),
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
def get_data(url: str) -> dict:
    """GET a URL and return the decoded JSON payload (retried with backoff)."""
    response = httpx.get(url)
    response.raise_for_status()
    return response.json()


def get_experiment_description(exp_dict: dict) -> str:
    """Extract the description from an experiment dictionary.

    Raises:
        KeyError: if no description field can be found.
    """
    if "experiment" in exp_dict:
        if "description" in exp_dict["experiment"]:
            return exp_dict["experiment"]["description"]
        raise KeyError(f"Could not find description field in {exp_dict}")
    if "experimentDescription" in exp_dict:
        return exp_dict["experimentDescription"]
    raise KeyError(f"Could not find description field in {exp_dict}")


def get_experiment_accession(exp_dict: dict) -> str:
    """Extract the accession from an experiment dictionary.

    Raises:
        KeyError: if no accession field can be found.
    """
    if "experiment" in exp_dict:
        if "accession" in exp_dict["experiment"]:
            return exp_dict["experiment"]["accession"]
        raise KeyError(f"Could not find accession field in {exp_dict}")
    if "experimentAccession" in exp_dict:
        return exp_dict["experimentAccession"]
    raise KeyError(f"Could not find accession field in {exp_dict}")


def get_properties_values(exp_dict: dict) -> list:
    """Collect all property test values from an experiment's column headers.

    Raises:
        KeyError: when a column header carries neither summary key.
    """
    values = []
    for column_header_dict in exp_dict["columnHeaders"]:
        key_found = False
        for key in ["assayGroupSummary", "contrastSummary"]:
            if key in column_header_dict:
                for property_dict in column_header_dict[key]["properties"]:
                    values.append(property_dict["testValue"])
                key_found = True
                break
        if not key_found:
            raise KeyError(f"Could not find property value in {column_header_dict}")
    # drop empty strings, then duplicates
    values = [value for value in values if value != ""]
    return list(set(values))


def get_eatlas_experiments() -> list[dict]:
    """Fetch the full list of Expression Atlas experiments."""
    data = get_data(ALL_EXP_URL)
    return data["experiments"]


def filter_by_platform(experiments: list[dict], platform: str | None) -> list[dict]:
    """Keep experiments matching the requested platform.

    When ``platform`` is None, keep any experiment whose normalised platform
    is allowed ('rnaseq' or 'microarray').
    """
    platform_experiments = []
    for exp_dict in experiments:
        if technology_type := exp_dict.get("technologyType"):
            parsed_technology_type = (
                technology_type[0]
                if isinstance(technology_type, list)
                else technology_type
            )
            # normalised platform name, e.g. in ["rnaseq", "microarray", "proteomics", ...]
            parsed_platform = (
                parsed_technology_type.lower().split(" ")[0].replace("-", "")
            )

            if platform is not None:
                if parsed_platform == platform:
                    platform_experiments.append(exp_dict)
            elif parsed_platform in ALLOWED_PLATFORMS:
                platform_experiments.append(exp_dict)

        else:
            # fix: listing dictionaries carry 'experimentAccession' (see the
            # other helpers); the former 'accession' lookup raised KeyError
            logger.warning(
                f"Technology type not found for experiment {exp_dict.get('experimentAccession')}"
            )
    return platform_experiments


def get_species_name_without_subspecies(species: str) -> str:
    """Drop any subspecies part: 'Hordeum vulgare subsp. vulgare' -> 'Hordeum vulgare'."""
    return " ".join(species.split(" ")[:2])


def get_species_experiments(experiments: list[dict], species: str) -> list[dict]:
    """Keep experiments whose (subspecies-stripped) species matches ``species``."""
    species_experiments = []
    for exp_dict in experiments:
        # if the species name carries a subspecies part, compare the first two words only
        exp_species = get_species_name_without_subspecies(exp_dict["species"])
        if exp_species == species:
            species_experiments.append(exp_dict)
    return species_experiments


def get_experiment_data(exp_dict: dict) -> dict:
    """Fetch the full data of one experiment from its accession."""
    exp_url = ALL_EXP_URL + exp_dict["experimentAccession"]
    return get_data(exp_url)


def filter_out_excluded_accessions(experiments: list[dict]) -> list[dict]:
    """Drop experiments whose accession matches an exclusion pattern."""
    valid_experiments = []
    for exp_dict in experiments:
        for accession_pattern in EXCLUDED_ACCESSION_PATTERNS:
            if exp_dict["experimentAccession"].startswith(accession_pattern):
                logger.warning(
                    f"Skipping experiment {exp_dict['experimentAccession']} due to exclusion pattern"
                )
                break
        else:
            # no exclusion pattern matched
            valid_experiments.append(exp_dict)
    return valid_experiments


def parse_experiment(exp_dict: dict) -> dict:
    """Build a flat record (accession, description, properties) for one experiment."""
    accession = get_experiment_accession(exp_dict)
    description = get_experiment_description(exp_dict)
    # fetching the full experiment entry to collect its properties
    exp_data = get_experiment_data(exp_dict)
    properties_values = get_properties_values(exp_data)

    return {
        "accession": accession,
        "description": description,
        "properties": properties_values,
    }


def filter_experiment_with_keywords(exp_dict: dict, keywords: list[str]) -> dict | None:
    """Return the record enriched with found keywords, or None when none match."""
    all_searchable_fields = [exp_dict["description"]] + exp_dict["properties"]
    found_keywords = keywords_in_fields(all_searchable_fields, keywords)
    if found_keywords:
        exp_dict["found_keywords"] = list(set(found_keywords))
        return exp_dict
    return None


def get_metadata_for_selected_experiments(
    experiments: list[dict], results: list[dict]
) -> list[dict]:
    """Keep the original metadata of the experiments present in ``results``."""
    filtered_accessions = [result_dict["accession"] for result_dict in results]
    return [
        exp_dict
        for exp_dict in experiments
        if get_experiment_accession(exp_dict) in filtered_accessions
    ]


def sample_experiments_randomly(
    experiments: list[dict], sampling_size: int, seed: int
) -> tuple[list[str], bool]:
    """Randomly pick experiments whose total sample count fits the quota.

    Returns the selected accessions and a flag telling whether the quota was
    reached before all experiments could be considered.
    """
    random.seed(seed)
    sampled_experiments = []

    total_nb_samples = 0
    sampling_quota_reached = False
    experiments_left = list(experiments)
    while experiments_left:
        # stop when even the smallest remaining experiment no longer fits
        experiments_left_nb_samples = [exp["nb_samples"] for exp in experiments_left]
        min_nb_samples = min(experiments_left_nb_samples)
        if min_nb_samples > sampling_size - total_nb_samples:
            sampling_quota_reached = True
            logger.warning("Sampling quota reached")
            break

        experiment = None
        test_total_nb_samples = int(total_nb_samples)
        experiments_not_tested = list(experiments_left)
        while experiments_not_tested:
            experiment = random.choice(experiments_not_tested)
            experiments_not_tested.remove(experiment)
            # keep the first randomly drawn experiment that still fits the quota
            test_total_nb_samples = total_nb_samples + experiment["nb_samples"]
            if test_total_nb_samples <= sampling_size:
                break

        # should not happen (the min check above guarantees a fit) -- safety net
        if experiment is None:
            logger.error("No experiment found")
            continue

        total_nb_samples = test_total_nb_samples
        experiments_left.remove(experiment)
        sampled_experiments.append(experiment)

    return [exp["accession"] for exp in sampled_experiments], sampling_quota_reached


def format_species_name(species: str) -> str:
    """Normalise a CLI species name: underscores to spaces, capitalised."""
    return species.replace("_", " ").capitalize().strip()

##################################################################
##################################################################
# MAIN
##################################################################
##################################################################


def main():
    """Query Expression Atlas, filter/sample experiments, export accessions and metadata."""
    args = parse_args()

    results = None
    selected_accessions = []
    selected_experiments = []

    # ~~~~~~~~~~~~~~~~ parsing Expression Atlas ~~~~~~~~~~~~~~~~

    species_name = format_species_name(args.species)
    keywords = args.keywords

    logger.info(f"Getting experiments corresponding to species {species_name}")
    experiments = get_eatlas_experiments()

    logger.info("Filtering on species name")
    experiments = get_species_experiments(experiments, species_name)
    logger.info(f"Found {len(experiments)} experiments for species {species_name}")

    logger.info("Filtering experiments based on platform")
    experiments = filter_by_platform(experiments, args.platform)

    logger.info("Filtering out excluded accessions")
    experiments = filter_out_excluded_accessions(experiments)

    logger.info("Parsing experiments")
    with Pool(processes=args.nb_cpus) as pool:
        results = pool.map(parse_experiment, experiments)

    if keywords:
        logger.info(f"Filtering experiments with keywords {keywords}")
        func = partial(filter_experiment_with_keywords, keywords=keywords)
        with Pool(processes=args.nb_cpus) as pool:
            results = [res for res in pool.map(func, results) if res is not None]
        logger.info(
            f"Found {len(results)} experiments corresponding to keywords {keywords}"
        )

    # accessions of the experiments kept so far
    selected_accessions = [exp_dict["accession"] for exp_dict in results]

    sampling_status = "ok"
    # fix: a seed of 0 is valid; the former truthiness test silently
    # disabled random sampling when seed == 0
    if args.random_sampling_size and args.random_sampling_seed is not None:
        selected_accession_to_nb_samples = [
            {
                "accession": exp_dict["experimentAccession"],
                "nb_samples": exp_dict["numberOfAssays"],
            }
            for exp_dict in experiments
            if exp_dict["experimentAccession"] in selected_accessions
        ]

        nb_samples_df = pd.DataFrame.from_dict(selected_accession_to_nb_samples)
        nb_samples_df.to_csv("selected_accession_to_nb_samples.csv", index=False)

        logger.info("Sampling experiments randomly")
        selected_accessions, sampling_quota_reached = sample_experiments_randomly(
            selected_accession_to_nb_samples,
            args.random_sampling_size,
            args.random_sampling_seed,
        )
        logger.info(
            f"Kept {len(selected_accessions)} experiments after random sampling"
        )

        if sampling_quota_reached:
            sampling_status = "full"

    # written so the wrapper module can read back the sampling status
    with open(SAMPLING_QUOTA_OUTFILE, "w") as fout:
        fout.write(sampling_status)

    # keep metadata only for the selected experiments
    selected_experiments = get_metadata_for_selected_experiments(experiments, results)

    if not selected_accessions:
        logger.warning(
            f"Could not find experiments for species {species_name} and keywords {keywords}"
        )

    # ~~~~~~~~~~~~~~~~ exporting data ~~~~~~~~~~~~~~~~

    logger.info(f"Writing accessions to {ACCESSION_OUTFILE_NAME}")
    with open(ACCESSION_OUTFILE_NAME, "w") as fout:
        fout.writelines([f"{acc}\n" for acc in selected_accessions])

    logger.info(
        f"Writing metadata of all experiments for species {species_name} to {SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME}"
    )
    df = pd.DataFrame.from_dict(experiments)
    df.to_csv(
        SPECIES_EXPERIMENTS_METADATA_OUTFILE_NAME, sep="\t", index=False, header=True
    )

    if selected_experiments:
        logger.info(
            f"Writing metadata of filtered experiments to {SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME}"
        )
        df = pd.DataFrame.from_dict(selected_experiments)
        df.to_csv(
            SELECTED_EXPERIMENTS_METADATA_OUTFILE_NAME,
            sep="\t",
            index=False,
            header=True,
        )

    if results:
        logger.info(
            f"Writing filtered experiments with keywords to {FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME}"
        )
        with open(FILTERED_EXPERIMENTS_WITH_KEYWORDS_OUTFILE_NAME, "w") as fout:
            yaml.dump(results, fout)


if __name__ == "__main__":
    main()


# ---- diff: new file bin/get_geo_dataset_accessions.py (truncated in this view) ----
#!/usr/bin/env python3
# Written by Olivier Coen. Released under the MIT license.

import argparse
import logging
import random
import tarfile
from functools import partial
from multiprocessing import Pool
from pathlib import Path
from urllib.request import urlretrieve

import httpx
import pandas as pd
import xmltodict
from Bio import Entrez
from natural_language_utils import keywords_in_fields
from tenacity import (
    before_sleep_log,
    retry,
    stop_after_delay,
    wait_exponential,
)
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# set a custom writable directory before any Entrez operations
# mandatory for running the script in an apptainer container
# Entrez.Parser.Parser.directory("/tmp/biopython")

ALLOWED_PLATFORMS = ["rnaseq", "microarray"]

ACCESSION_OUTFILE_NAME = "accessions.txt"
SPECIES_DATASETS_OUTFILE_NAME = "geo_all_datasets.metadata.tsv"
REJECTED_DATASETS_OUTFILE_NAME = "geo_rejected_datasets.metadata.tsv"
# WRONG_SPECS_DATASETS_METADATA_OUTFILE_NAME = "geo_wrong_platform_moltype_datasets.metadata.tsv"
# WRONG_KEYWORDS_DATASETS_METADATA_OUTFILE_NAME = "geo_wrong_keywords_datasets.metadata.tsv"
# PLATFORM_NOT_AVAILABLE_DATASETS_METADATA_OUTFILE_NAME = "platform_not_available_datasets.metadata.tsv"
# GENE_ID_MAPPING_ISSUES_DATASETS_METADATA_OUTFILE_NAME =
"gene_id_mapping_issues_datasets.metadata.tsv" +SELECTED_DATASETS_OUTFILE_NAME = "geo_selected_datasets.metadata.tsv" + +ENTREZ_QUERY_MAX_RESULTS = 9999 +ENTREZ_EMAIL = "stableexpression@nfcore.com" +PLATFORM_METADATA_CHUNKSIZE = 2000 + +NCBI_API_BASE_URL = ( + "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc={accession}" +) +STOP_RETRY_AFTER_DELAY = 600 + +NB_PROBE_IDS_TO_PARSE = 1000 +NB_PROBE_IDS_TO_SAMPLE = 10 + +SUPERSERIES_SUMMARY = "This SuperSeries is composed of the SubSeries listed below." + +ALLOWED_LIBRARY_SOURCES = ["transcriptomic", "RNA"] +ALLOWED_MOLECULE_TYPES = ["RNA", "SRA"] + +GEO_EXPERIMENT_TYPE_TO_PLATFORM = { + "Expression profiling by array": "microarray", + "Expression profiling by high throughput sequencing": "rnaseq", +} + +MINIML_TMPDIR = "geo_miniml" +PLATFORM_SOFT_TMPDIR = "geo_platform_soft" +Path(MINIML_TMPDIR).mkdir(exist_ok=True) +Path(PLATFORM_SOFT_TMPDIR).mkdir(exist_ok=True) + + +################################################################## +################################################################## +# EXCEPTIONS +################################################################## +################################################################## + + +class GeoDatasetNothingFoundError(Exception): + pass + + +class GeoPlatformDataTableNotFound(Exception): + pass + + +################################################################## +################################################################## +# FUNCTIONS +################################################################## +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Get GEO Datasets accessions") + parser.add_argument( + "--species", + type=str, + required=True, + help="Search GEO Datasets for this specific species", + ) + parser.add_argument( + "--keywords", + type=str, + nargs="*", + help="Keywords to search for in datasets description", + ) + parser.add_argument( + 
"--platform", type=str, help="Platform type", choices=ALLOWED_PLATFORMS + ) + parser.add_argument( + "--exclude-accessions-in", + dest="excluded_accessions_file", + type=Path, + help="Exclude accessions contained in this file", + ) + parser.add_argument( + "--random-sampling-size", + dest="random_sampling_size", + type=int, + help="Random sampling size", + ) + parser.add_argument( + "--random-sampling-seed", + dest="random_sampling_seed", + type=int, + help="Random sampling seed", + ) + parser.add_argument( + "--cpus", type=int, dest="nb_cpus", required=True, help="Number of CPUs" + ) + parser.add_argument( + "--accessions", + type=str, + help="[For dev purposes / testing: provide directly accessions (separated by commas) and try to get their metadata]", + ) + return parser.parse_args() + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# QUERIES TO ENTREZ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: {}), +) +def send_request_to_entrez_esearch(query: str) -> dict: + Entrez.email = ENTREZ_EMAIL + with Entrez.esearch( + db="gds", term=query, retmax=ENTREZ_QUERY_MAX_RESULTS + ) as handle: + return Entrez.read(handle) + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: []), +) +def send_request_to_entrez_esummary(ids: list[str]) -> list[dict]: + Entrez.email = ENTREZ_EMAIL + ids_str = ",".join(ids) + with Entrez.esummary( + db="gds", id=ids_str, retmax=ENTREZ_QUERY_MAX_RESULTS + ) as handle: + return Entrez.read(handle) + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + 
before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: None), +) +def send_request_to_ncbi_api(accession: str) -> httpx.Response | None: + url = NCBI_API_BASE_URL.format(accession=accession) + server_error = False + response = None + + try: + response = httpx.get(url) + except httpx.ConnectError: + server_error = True + else: + try: + response.raise_for_status() + except Exception as err: + if str(response.status_code).startswith("5"): # error 500 -> 509 + server_error = True + raise err + else: + logger.error( + f"Error {response.status_code} while sending request to NCBI: {err}" + ) + raise err + + # if we get connection issues or 500 -> 509 server errors + # we stop immediately for this accession (return None) + if server_error: + logger.critical( + f"Server error while sending request to NCBI for accession {accession}" + ) + + return response + + +@retry( + stop=stop_after_delay(STOP_RETRY_AFTER_DELAY), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), + retry_error_callback=(lambda _: None), +) +def download_file_at_url(url: str, output_file: Path): + urlretrieve(url, output_file) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GEO DATASETS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def chunk_list(lst: list, chunksize: int) -> list: + """Splits a list into chunks of a given size. + + Args: + lst (list): The list to split. + chunksize (int): The size of each chunk. + + Returns: + list: A list of chunks, where each chunk is a list of len(chunksize). + """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +def fetch_geo_datasets_for_species(species: str) -> list[dict]: + """ + Fetch GEO datasets (GSE series) for a given species + + Args: + species (str): Scientific name of the species (e.g. "Homo sapiens"). 
+ """
+    dataset_types = [
+        f'"{experiment_type}"[DataSet Type]'
+        for experiment_type in GEO_EXPERIMENT_TYPE_TO_PLATFORM
+    ]
+    formatted_dataset_type = "(" + " OR ".join(dataset_types) + ")"
+
+    query = f'"{species}"[Organism] AND "gse"[Entry Type] AND {formatted_dataset_type}'
+    logger.info(f"Fetching GEO datasets with query: {query}")
+
+    # getting list of all datasets IDs for this species
+    # we need possibly to perform multiple queries because the max number of returned results is capped
+    nb_entries = None
+    retstart = 0
+    record = {}
+    while not nb_entries or retstart < nb_entries:
+        record = send_request_to_entrez_esearch(query)
+
+        if not record:
+            logger.warning(f"Failed to query Entrez Esearch with query: {query}")
+            return []
+
+        # getting total nb of entries
+        if not nb_entries:
+            nb_entries = int(record["Count"])
+
+        # if there is no entry for this species
+        if nb_entries == 0:
+            logger.info(f"No entries found for query: {query}")
+            return []
+
+        # setting next cursor to the next group
+        retstart += ENTREZ_QUERY_MAX_RESULTS
+
+    ids = record.get("IdList", [])
+    if not ids:
+        logger.warning("No GEO datasets found for your query.")
+        return []
+
+    # fetching summary info
+    results = send_request_to_entrez_esummary(ids)
+
+    # keeping only series datasets (just a double check here)
+    # and removing superseries (they are just containers of series that are also contained here)
+    return [
+        r
+        for r in results
+        if "GSE" in r["Accession"] and r["summary"] != SUPERSERIES_SUMMARY
+    ]
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# FORMATTING
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+def format_species(species: str) -> str:
+    return "_".join(species.lower().split(" "))
+
+
+def format_platform_name(platform_name: str) -> str:
+    return platform_name.replace("_", "").replace("-", "").lower()
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# GET METADATA
+# 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def download_dataset_metadata(ftp_link: str, accession: str) -> Path | None: + filename = f"miniml/{accession}_family.xml.tgz" + ftp_url = ftp_link + filename + output_file = Path(MINIML_TMPDIR) / f"{accession}.tar.gz" + download_file_at_url(ftp_url, output_file) + if output_file.exists(): + return output_file + else: + logger.error(f"Failed to download dataset metadata for accession: {accession}") + return None + + +def parse_dataset_metadata(file: Path, accession: str) -> dict | None: + with tarfile.open(file, "r:gz") as tar: + file_to_read = f"{accession}_family.xml" + + try: + f = tar.extractfile(file_to_read) + except KeyError: + file_to_read = f"{accession}_family.xml/{accession}_family.xml" + try: + f = tar.extractfile(file_to_read) + except KeyError: + return None + + if f is None: + logger.warning(f"Failed to get file: {file_to_read}") + return None + + try: + xml_content = f.read().decode("utf-8") + except UnicodeDecodeError: + logger.warning(f"Failed to decode file: {file_to_read}") + return None + + return xmltodict.parse(xml_content)["MINiML"] + + +def parse_characteristics( + characteristics: str | dict | list, stored_characteristics: list +): + if isinstance(characteristics, str): + stored_characteristics.append(characteristics) + elif isinstance(characteristics, dict): + if "#text" in characteristics: + stored_characteristics.append(characteristics["#text"]) + elif isinstance(characteristics, list): + for c in characteristics: + parse_characteristics(c, stored_characteristics) + + +def parse_interesting_metadata( + dataset_metadata: dict, additional_metadata: dict +) -> dict: + """ + Parses interesting metadata from a dataset metadata dictionary and additional metadata dictionary. + + Args: + dataset_metadata (dict): The dataset metadata dictionary. + additional_metadata (dict): The additional metadata dictionary. + + Returns: + dict: The parsed interesting metadata dictionary. 
+ """ + sample_characteristics = [] + sample_library_strategies = [] + sample_library_sources = [] + sample_descriptions = [] + sample_titles = [] + sample_molecule_types = [] + + platform_accessions = [ + "GPL" + gpl_id for gpl_id in dataset_metadata["GPL"].split(";") + ] + + experiment_types = dataset_metadata["gdsType"] + experiment_types = ( + experiment_types if isinstance(experiment_types, list) else [experiment_types] + ) + + # if additional metadata have sample information + if "Sample" in additional_metadata: + # change to list if it's a single dictionary + if isinstance(additional_metadata["Sample"], dict): + additional_metadata["Sample"] = [additional_metadata["Sample"]] + + for sample in additional_metadata["Sample"]: + # storing description if exists + if sample_description := sample.get("Description"): + sample_descriptions.append(sample_description) + + # storing title if exists + if sample_title := sample.get("Title"): + sample_titles.append(sample_title) + + # storing molecule type if exists + if sample_molecule_type := sample.get("Type"): + sample_molecule_types.append(sample_molecule_type) + + # storing library strategy if exists + if sample_library_strategy := sample.get("Library-Strategy"): + sample_library_strategies.append(sample_library_strategy) + + # storing library source if exists + if sample_library_source := sample.get("Library-Source"): + sample_library_sources.append(sample_library_source) + + # parsing sample metadata + if channels := sample.get("Channel"): + if isinstance(channels, dict): + channels = [channels] + for channel in channels: + parse_characteristics( + channel["Characteristics"], sample_characteristics + ) + + return { + "accession": dataset_metadata["Accession"], + "taxon": dataset_metadata["taxon"], + "platform_accessions": platform_accessions, + "summary": dataset_metadata["summary"], + "title": dataset_metadata["title"], + "overall_design": additional_metadata["Series"]["Overall-Design"], + "experiment_types": 
experiment_types, + "sample_characteristics": list(set(sample_characteristics)), + "sample_library_strategies": list(set(sample_library_strategies)), + "sample_library_sources": list(set(sample_library_sources)), + "sample_descriptions": list(set(sample_descriptions)), + "sample_titles": list(set(sample_titles)), + "sample_molecule_types": list(set(sample_molecule_types)), + } + + +def fetch_dataset_metadata(dataset_metadata: dict) -> dict | None: + """ + Parses metadata from a dataset metadata dictionary. + + Args: + dataset_metadata (dict): The dataset metadata dictionary. + + Returns: + dict | None: The parsed metadata dictionary or None if the metadata is missing. + """ + accession = dataset_metadata["Accession"] + ftp_link = dataset_metadata["FTPLink"].replace("ftp://", "https://") + downloaded_file = download_dataset_metadata(ftp_link, accession) + if downloaded_file is None: + logger.warning(f"Skipping {accession} as metadata download failed") + return None + + additional_metadata = parse_dataset_metadata(downloaded_file, accession) + + # if we could not get additional metadata, we lack too much information to conclude + if additional_metadata is None: + logger.warning(f"Skipping {accession} as additional metadata is missing") + return None + + # parsing interesting information in all available metadata + return parse_interesting_metadata(dataset_metadata, additional_metadata) + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# METADATA TESTS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def exclude_unwanted_accessions( + datasets: list[dict], excluded_accessions: list[str] +) -> tuple[list[dict], list[dict]]: + datasets_to_keep = [] + excluded_datasets = [] + for dataset in datasets: + if dataset["accession"] in excluded_accessions: + excluded_datasets.append(dataset) + else: + datasets_to_keep.append(dataset) + return datasets_to_keep, excluded_datasets + + +def check_species_issues(parsed_species_list: 
list, species: str) -> str | None: + # trying to find our species in the list of species parsed + for parsed_species in parsed_species_list: + if format_species(parsed_species) == format_species(species): + return None + return f"PARSED SPECIES: {parsed_species_list}" + + +def check_molecule_type_issues(molecules_types: list) -> str | None: + # we want only GEO series that contain only RNA molecules + # for other series, they should be superseries contained other series that are being parsed too + # so anyway, this would lead in duplicates + if any( + [ + molecule_type.upper() in ALLOWED_MOLECULE_TYPES + for molecule_type in molecules_types + ] + ): + return None + return f"MOLECULE TYPES: {molecules_types}" + + +def check_experiment_type_issues(experiment_types: list, platform: str) -> str | None: + for experiment_type in experiment_types: + # if at least one experiment type is ok, we keep this dataset + if GEO_EXPERIMENT_TYPE_TO_PLATFORM.get(experiment_type) == platform: + return None + return f"EXPERIMENT TYPES: {experiment_types}" + + +def check_source_issues(library_sources: list) -> str | None: + # if we have no data about library sources, we just cannot infer + if not library_sources: + return None + if any( + library_source in ALLOWED_LIBRARY_SOURCES for library_source in library_sources + ): + return None + return f"LIBRARY SOURCES: {library_sources}" + + +def search_keywords(dataset: dict, keywords: list[str]) -> tuple[list, str | None]: + accession = dataset["accession"] + all_searchable_fields = ( + [dataset["summary"], dataset["title"]] + + dataset["sample_characteristics"] + + dataset["sample_descriptions"] + + dataset["sample_titles"] + ) + found_keywords = keywords_in_fields(all_searchable_fields, keywords) + # only returning experiments if found keywords + if found_keywords: + dataset["found_keywords"] = list(set(found_keywords)) + logger.info(f"Found keywords: {found_keywords} in accession {accession}") + return found_keywords, None + else: + 
return [], "NO KEYWORDS_FOUND" + + +def check_dataset( + dataset: dict, species: str, platform: str | None, keywords: list[str] | None +) -> tuple[list, dict]: + accession = dataset["accession"] + parsed_species_list = dataset["taxon"].split("; ") + experiment_types = dataset["experiment_types"] + library_sources = dataset["sample_library_sources"] + molecules_types = dataset["sample_molecule_types"] + + issues = [] + + # checking species + if issue := check_species_issues(parsed_species_list, species): + issues.append(issue) + + # checking platform + if platform is not None: + if issue := check_experiment_type_issues(experiment_types, platform): + issues.append(issue) + + # checking that library sources fit + if issue := check_source_issues(library_sources): + issues.append(issue) + + # checking that all molecule types are RNA + if issue := check_molecule_type_issues(molecules_types): + issues.append(issue) + + found_keywords = [] + if keywords: + found_keywords, keyword_issue = search_keywords(dataset, keywords) + if keyword_issue: + issues.append(keyword_issue) + + if issues: + rejection_dict = {accession: issues} + else: + rejection_dict = {} + + return found_keywords, rejection_dict + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# GEO PLATFORMS +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def fetch_geo_platform_metadata(datasets: list[dict]) -> dict: + """ + Fetch data for a GEO platform + + Args: + platform_accession (str): accession of the platform + """ + # unique list of platform accessions + platform_accessions = list( + set( + [ + platform_accession + for dataset in datasets + for platform_accession in dataset["platform_accessions"] + ] + ) + ) + # formating query + formatted_platform_accessions = [ + f'"{platform_accession}"[GEO Accession]' + for platform_accession in platform_accessions + ] + platform_accessions_str = " OR ".join(formatted_platform_accessions) + query = 
f'({platform_accessions_str}) AND "gpl"[Entry Type] ' + + record = send_request_to_entrez_esearch(query=query) + + ids = record.get("IdList", []) + if not ids: + logger.warning(f"No GEO platform found for accessions {platform_accessions}.") + return {} + + # fetching summary info + # one single request to NCBI for all platform accessions + platform_metadatas = send_request_to_entrez_esummary(ids) + # return dict associating dataset accessions with platform metadata + return { + platform_metadata["Accession"]: platform_metadata + for platform_metadata in platform_metadatas + } + + +def check_dataset_platforms( + dataset: dict, accession_to_platform_metadata: dict, species: str +) -> dict: + accession = dataset["accession"] + platform_accessions = dataset["platform_accessions"] + + if not platform_accessions: + return {accession: "NO PLATFORM ACCESSIONS"} + + platforms_metadata = [ + accession_to_platform_metadata[platform_accession] + for platform_accession in dataset["platform_accessions"] + ] + + # getting list of platform taxon + platforms_taxons = [] + for metadata in platforms_metadata: + if metadata.get("taxon") is not None: + platforms_taxons += metadata.get("taxon").split("; ") + platforms_taxons = list(set(platforms_taxons)) + + if not platforms_taxons: + return {accession: "NO PLATFORM TAXON"} + + # checking if at least one of the platform accession is the good one + # sample will be further filtered during download (download_geo_data.R) + if not any( + format_species(species) == format_species(taxon) for taxon in platforms_taxons + ): + return {accession: f"TAXON MISMATCH: {platforms_taxons}"} + + return {} + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# RANDOM SAMPLING +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def sample_experiments_randomly( + experiments: list[dict], sampling_size: int, seed: int +) -> list[str]: + random.seed(seed) + sampled_experiments = [] + + total_nb_samples = 0 + 
experiments_left = list(experiments) + while experiments_left and total_nb_samples <= sampling_size: + # if the min number of samples is greater than the remaining space left, we get out of the loop + experiments_left_nb_samples = [exp["nb_samples"] for exp in experiments_left] + min_nb_samples = min(experiments_left_nb_samples) + if min_nb_samples > sampling_size - total_nb_samples: + break + + found_experiment = False + test_total_nb_samples = int(total_nb_samples) + not_chosen_yet = list(experiments_left) + while not_chosen_yet and not found_experiment: + experiment = random.choice(not_chosen_yet) + not_chosen_yet.remove(experiment) + test_total_nb_samples = total_nb_samples + experiment["nb_samples"] + if test_total_nb_samples <= sampling_size: + found_experiment = True + + # if the last one was not good, it means we reached the limit of samples we can take + if not found_experiment: + break + else: + total_nb_samples = test_total_nb_samples + experiments_left.remove(experiment) + sampled_experiments.append(experiment) + + return [exp["accession"] for exp in sampled_experiments] + + +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# EXPORT +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +def sort_if_list(x): + if isinstance(x, list): + return sorted(x) + else: + return x + + +def export_dataset_metadatas( + datasets: list[dict], output_file: str, clean_columns: bool = True +): + if datasets: + df = pd.DataFrame.from_dict(datasets) + # all dataframe contain the column "accession" + # sorting by accessions to ensure that outputs are reproducible + df.sort_values(by="accession", inplace=True) + for col in df.columns: + df[col] = df[col].apply(sort_if_list) + # cleaning columns so that MultiQC can parse them + if clean_columns: + for col in df.columns: + df[col] = df[col].astype(str).str.replace("\n", "") + df[col] = df[col].astype(str).str.replace("\t", "") + df.to_csv( + output_file, + sep="\t", + index=False, + 
header=True, + ) + + +################################################################## +################################################################## +# MAIN +################################################################## +################################################################## + + +def main(): + args = parse_args() + + random_sampling_size = args.random_sampling_size + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING GEO DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Getting datasets corresponding to species {args.species}") + datasets = fetch_geo_datasets_for_species(args.species) + logger.info(f"Found {len(datasets)} datasets for species {args.species}") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # FOR DEV PURPOSES / TESTING: RESTRICT TO SPECIFIC ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if args.accessions: + logger.info(f"Keeping only accessions {args.accessions}") + dev_accessions = args.accessions.split(",") + datasets = [d for d in datasets if d["Accession"] in dev_accessions] + logger.info(f"Kept {len(datasets)} datasets for dev / testing purposes") + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # PARSING DATASET METADATA + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Parsing metadata for {len(datasets)} datasets") + augmented_datasets = [] + with ( + Pool(processes=args.nb_cpus) as p, + tqdm(total=len(datasets)) as pbar, + ): + for result in p.imap_unordered(fetch_dataset_metadata, datasets): + pbar.update() + pbar.refresh() + if result is None: + continue + augmented_datasets.append(result) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # VALIDATING DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Validating {len(augmented_datasets)} datasets") + checked_datasets = [] + rejection_dict = {} + for dataset in tqdm(augmented_datasets): + found_keywords, 
issue_dict = check_dataset(
+            dataset, args.species, args.platform, args.keywords
+        )
+        if issue_dict:
+            rejection_dict |= issue_dict
+        else:
+            if found_keywords:
+                dataset["found_keywords"] = found_keywords
+            checked_datasets.append(dataset)
+
+    logger.info(f"Validated {len(checked_datasets)} datasets")
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # EXCLUDING UNWANTED ACCESSIONS
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    # we exclude unwanted accessions only now
+    # because we want to get the metadata of the excluded datasets
+    # in order to adjust the random sampling size
+    if args.excluded_accessions_file:
+        # parsing list of accessions which were already fetched from Expression Atlas
+        with open(args.excluded_accessions_file) as fin:
+            excluded_accessions = fin.read().splitlines()
+        logger.info("Excluding unwanted datasets")
+        checked_datasets, excluded_datasets = exclude_unwanted_accessions(
+            checked_datasets, excluded_accessions
+        )
+        logger.info(
+            f"{len(checked_datasets)} datasets remaining after excluding unwanted accessions"
+        )
+
+        # adjusting random sampling size by subtracting the number of excluded accessions
+        if random_sampling_size:
+            total_nb_excluded_samples = sum(
+                [len(dataset["sample_titles"]) for dataset in excluded_datasets]
+            )
+            logger.info(
+                f"Subtracting {total_nb_excluded_samples} samples from random sampling size"
+            )
+            random_sampling_size -= total_nb_excluded_samples
+            # keeping it positive (just in case)
+            if random_sampling_size < 0:
+                logger.warning(
+                    f"Random sampling size is negative ({random_sampling_size}), setting it to 0"
+                )
+                random_sampling_size = 0
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # GETTING METADATA OF SEQUENCING PLATFORMS
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    logger.info("Getting platform metadata")
+    # making chunks to group httpx to NCBI GEO
+    checked_datasets_chunks = chunk_list(checked_datasets, PLATFORM_METADATA_CHUNKSIZE)
+    # 
resetting selecting datasets + accession_to_platform_metadata = {} + for selected_datasets_chunk in tqdm(checked_datasets_chunks): + accession_to_platform_metadata |= fetch_geo_platform_metadata( + selected_datasets_chunk + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # VALIDATING EACH PLATFORM SEPARATELY, DATASET BY DATASET + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + logger.info(f"Checking each platform for {len(checked_datasets)} datasets") + func = partial( + check_dataset_platforms, + accession_to_platform_metadata=accession_to_platform_metadata, + species=args.species, + ) + selected_datasets = [] + # resetting selecting datasets + for dataset in tqdm(checked_datasets): + accession = dataset["accession"] + issue_dict = func(dataset) + if issue_dict: + if accession in rejection_dict: # should not happen but in case + rejection_dict[accession] += issue_dict[accession] + else: + rejection_dict |= issue_dict + else: + selected_datasets.append(dataset) + + if rejection_dict: + logger.warning(f"{len(rejection_dict)} datasets rejected") + logger.warning(f"Reasons for rejection: {rejection_dict}") + + selected_accessions = sorted( + [dataset["accession"] for dataset in selected_datasets] + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # RANDOM SAMPLING + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + if random_sampling_size is not None and args.random_sampling_seed is not None: + selected_accession_to_nb_samples = [ + { + "accession": dataset["accession"], + "nb_samples": len(dataset["sample_titles"]), + } + for dataset in selected_datasets + ] + + nb_samples_df = pd.DataFrame.from_dict(selected_accession_to_nb_samples) + nb_samples_df.to_csv("selected_accession_to_nb_samples.csv", index=False) + + logger.info("Sampling experiments randomly") + selected_accessions = sample_experiments_randomly( + selected_accession_to_nb_samples, + random_sampling_size, + args.random_sampling_seed, + ) + logger.info( + f"Kept 
{len(selected_accessions)} experiments after random sampling" + ) + selected_datasets = [ + dataset + for dataset in selected_datasets + if dataset["accession"] in selected_accessions + ] + else: + logger.info( + f"No random sampling requested. Kept {len(selected_datasets)} datasets" + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING ACCESSIONS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # sorting accessions to ensure that outputs are reproducible + selected_accessions = sorted(selected_accessions) + with open(ACCESSION_OUTFILE_NAME, "w") as fout: + fout.write("\n".join(selected_accessions)) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # EXPORTING DATASETS + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + export_dataset_metadatas(augmented_datasets, SPECIES_DATASETS_OUTFILE_NAME) + export_dataset_metadatas(selected_datasets, SELECTED_DATASETS_OUTFILE_NAME) + + rejected_datasets = [ + {"accession": accession, "reason": reason} + for accession, reason in rejection_dict.items() + ] + export_dataset_metadatas( + rejected_datasets, REJECTED_DATASETS_OUTFILE_NAME, clean_columns=False + ) + + +if __name__ == "__main__": + main() diff --git a/bin/get_ratio_standard_variation.py b/bin/get_ratio_standard_variation.py new file mode 100755 index 00000000..76d4ebfd --- /dev/null +++ b/bin/get_ratio_standard_variation.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# experimentally chosen +RATIO_CHUNK_SIZE = 100 + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file", + type=Path, + dest="ratio_file", + required=True, + help="File log of pairwise expression ratios", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_nb_rows(lf: pl.LazyFrame): + return lf.select(pl.len()).collect().item() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. 
+ """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def compute_standard_deviations(file: Path, low_memory: bool) -> pl.LazyFrame: + ratios_lf = pl.scan_parquet(file, low_memory=low_memory) + ratio_columns = [ + col for col in ratios_lf.collect_schema().names() if col.endswith("_log_ratio") + ] + concat_ratios_lf = ratios_lf.select( + [ + pl.concat_list( + [pl.col(col) for col in ratio_columns[i : i + RATIO_CHUNK_SIZE]] + ).alias(f"concat_list_chunk_{i // RATIO_CHUNK_SIZE}") + for i in range(0, len(ratio_columns), RATIO_CHUNK_SIZE) + ] + ).select(pl.concat_list(pl.all()).alias("ratios")) + return pl.concat( + [ + concat_ratios_lf.select("ratios"), + ratios_lf.select(pl.exclude("^.*_log_ratio$")), # gene_id & gene_id_other + ], + how="horizontal", + ).select( + pl.col("ratios").list.std(ddof=0).alias(config.RATIOS_STD_COLNAME), + pl.col(config.GENE_ID_COLNAME), + pl.col(f"{config.GENE_ID_COLNAME}_other"), + ) + + +def get_column_standard_deviations(std_lf: pl.LazyFrame, column: str) -> pl.LazyFrame: + # column is either config.GENE_ID_COLNAME or f"{config.GENE_ID_COLNAME}_other" + return ( + std_lf.group_by(column) + .agg(config.RATIOS_STD_COLNAME) # getting list of ratio std for this gene + .select( + pl.col(column).alias(config.GENE_ID_COLNAME), + pl.col(config.RATIOS_STD_COLNAME), + ) + ) + + +def group_standard_deviations(std_lf: pl.LazyFrame) -> pl.LazyFrame: + # getting the standard devs for genes in the gene_id column + std_a = get_column_standard_deviations(std_lf, column=config.GENE_ID_COLNAME) + # getting the standard devs for genes in the gene_id_other column + std_b = get_column_standard_deviations( + std_lf, column=f"{config.GENE_ID_COLNAME}_other" + ) + # concatenating both dataframes vertically + # if both lists of gene ids are the identical, + # we need to collect values only for one column to avoid duplicates + return ( + pl.concat([std_a, std_b], how="vertical") + 
.unique(subset=config.GENE_ID_COLNAME) + .sort( + config.GENE_ID_COLNAME + ) # only needed to have consistent output (for snapshots) + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + std_lf = compute_standard_deviations(args.ratio_file, low_memory) + std_lf = group_standard_deviations(std_lf) + + # when the ratio file corresponds to the same gene ids cross joined with themselves (i == i) + # then we want only only one row per gene id + + std_df = std_lf.collect() + if len(std_df) == 0: + raise ValueError( + f"No output following treatment of file {str(args.ratio_file)}" + ) + + outfile = args.ratio_file.name.replace("ratios", "std") + std_df.write_parquet(outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/gprofiler_map_ids.py b/bin/gprofiler_map_ids.py new file mode 100755 index 00000000..4a559bde --- /dev/null +++ b/bin/gprofiler_map_ids.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from pathlib import Path + +import config +import pandas as pd +from gprofiler_utils import convert_ids + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +MAPPED_GENE_IDS_OUTFILE = "mapped_gene_ids.csv" +METADATA_OUTFILE = "gene_metadata.csv" + +TARGET_DATABASE_CHOICES = ["ENTREZGENE", "ENSG"] + +FAILURE_REASON_FILE = "failure_reason.txt" + +################################################################## +# FUNCTIONS +################################################################## + + +def parse_args(): + parser = argparse.ArgumentParser("Map IDs using g:Profiler") + parser.add_argument( + "--gene-ids", + type=Path, + dest="gene_id_file", + required=True, + help="Input file containing gene IDs", + ) + parser.add_argument( + "--species", type=str, required=True, help="Species to convert IDs for" + ) + parser.add_argument( + "--target-db", + type=str, + dest="gprofiler_target_db", + required=True, + choices=TARGET_DATABASE_CHOICES, + help="Target database to convert IDs to", + ) + return parser.parse_args() + + +################################################################## +# MAIN +################################################################## + + +def main(): + args = parse_args() + + with open(args.gene_id_file, "r") as fin: + gene_ids = list(set([line.strip() for line in fin])) + + logger.info(f"Converting {len(gene_ids)} IDs for species {args.species} ") + + ############################################################# + # QUERYING g:PROFILER SERVER + ############################################################# + + gene_metadata_dfs = [] + + mapping_dict, gene_metadata_dfs = convert_ids( + gene_ids, args.species, args.gprofiler_target_db + ) + + if not mapping_dict: + msg = ( + f"No mapping found for gene IDs such as 
{' '.join(gene_ids[:5])} on species {args.species} " + + f"and g:Profiler target database {args.gprofiler_target_db}" + ) + logger.error(msg) + with open(FAILURE_REASON_FILE, "w") as fout: + fout.write(msg) + sys.exit(100) + + ############################################################# + # WRITING MAPPING + ############################################################# + + # making dataframe for mapping (only two columns: original and new) + mapping_df = ( + pd.DataFrame(mapping_dict, index=[0]) + .T.reset_index() # transpose: setting keys as indexes instead of columns + .rename( + columns={ + "index": config.ORIGINAL_GENE_ID_COLNAME, + 0: config.GENE_ID_COLNAME, + } + ) + .sort_values(by=config.ORIGINAL_GENE_ID_COLNAME) + ) + mapping_df.to_csv(MAPPED_GENE_IDS_OUTFILE, index=False, header=True) + + ############################################################# + # WRITING METADATA + ############################################################# + + gene_metadata_df = pd.concat(gene_metadata_dfs, ignore_index=True) + # dropping duplicates and keeping the first occurence + gene_metadata_df.drop_duplicates( + subset=[config.GENE_ID_COLNAME], keep="first" + ).sort_values(by=config.GENE_ID_COLNAME).to_csv( + METADATA_OUTFILE, index=False, header=True + ) + + +if __name__ == "__main__": + main() diff --git a/bin/gprofiler_utils.py b/bin/gprofiler_utils.py new file mode 100755 index 00000000..831b94b5 --- /dev/null +++ b/bin/gprofiler_utils.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import logging + +import config +import httpx +import pandas as pd +from tenacity import ( + before_sleep_log, + retry, + stop_after_delay, + wait_exponential, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +################################################################## +# CONSTANTS +################################################################## + +GPROFILER_CONVERT_API_ENDPOINT = "https://biit.cs.ut.ee/gprofiler/api/convert/convert/" +GPROFILER_CONVERT_BETA_API_ENDPOINT = ( + "https://biit.cs.ut.ee/gprofiler_beta/api/convert/convert/" +) + +CHUNKSIZE = 2000 # number of IDs to convert at a time - may create trouble if > 2000 + +COLS_TO_KEEP = ["incoming", "converted", "name", "description"] +DESCRIPTION_PART_TO_REMOVE_REGEX = r"\s*\[Source:.*?\]" + +GPROFILER_ERROR_MESSAGE = ( + "g:Profiler servers (main and beta) seem to be down... Please retry later... " + "If you have gene ID mappings and / or gene metadata for these datasets, you can provide them " + "directly using the `--gene_id_mapping` and `--gene_metadata` parameters respectively, " + "and by skipping the g:Profiler ID mapping step with `--skip_id_mapping`." +) + + +################################################################## +# FUNCTIONS +################################################################## + + +class GProfilerConnectionError(Exception): + pass + + +def format_species_name(species: str): + """ + Format a species name into a format accepted by g:Profiler. + Example: Arabidopsis thaliana -> athaliana + + Parameters + ---------- + species : str + The species name. + + Returns + ------- + str + The formatted species name. 
+ """ + splitted_species = species.lower().replace("_", " ").split(" ") + return splitted_species[0][0] + splitted_species[1] + + +@retry( + stop=stop_after_delay(600), + wait=wait_exponential(multiplier=1, min=1, max=30), + before_sleep=before_sleep_log(logger, logging.WARNING), +) +def request_conversion( + gene_ids: list, + species: str, + target_database: str, + url: str = GPROFILER_CONVERT_API_ENDPOINT, + attempts: int = 0, +) -> list[str]: + """ + Send a request to the g:Profiler API to convert a list of gene IDs. + + Parameters + ---------- + gene_ids : list + The list of gene IDs to convert. + species : str + The species to convert the IDs for. + url : str, optionalrequest_conversion + The URL to send the request to, by default GPROFILER_CONVERT_API_ENDPOINT + attempts : int, optional + The number of attempts already performed, by default 0 + + Returns + ------- + list + The list of dicts corresponding to the converted IDs. + """ + + # formatting species for g:Profiler + organism = format_species_name(species) + + if attempts > 0: + logger.warning( + "g:Profiler main server appears down, trying with the beta server..." + ) + + server_appears_down = False + + try: + response = httpx.post( + url=url, + json={"organism": organism, "query": gene_ids, "target": target_database}, + ) + except httpx.ConnectError: + server_appears_down = True + else: + try: + response.raise_for_status() + except Exception as err: + if str(response.status_code).startswith("5"): # error 500 -> 509 + server_appears_down = True + else: + logger.error( + f"Error {response.status_code} while converting IDs: {err}" + ) + raise err + + if server_appears_down: + if attempts == 0: + logger.warning( + "g:Profiler main server appears down, trying with the beta server..." + ) + return request_conversion( + gene_ids, + species, + target_database=target_database, + url=GPROFILER_CONVERT_BETA_API_ENDPOINT, # backup endpoint + attempts=1, + ) + else: + # both servers appear down, we stop here... 
+ logger.error(GPROFILER_ERROR_MESSAGE) + raise GProfilerConnectionError(GPROFILER_ERROR_MESSAGE) + + else: + return response.json()["result"] + + +def convert_chunk_of_ids( + gene_ids: list, species: str, gprofiler_target_db: str +) -> tuple[dict, pd.DataFrame]: + """ + Wrapper function that converts a list of gene IDs to another namespace. + + Parameters + ---------- + species : str + The species to convert the IDs for. + gene_ids : list + The IDs to convert. + target_database : str + The target database to convert to. + + Returns + ------- + dict + A dictionary where the keys are the original IDs and the values are the converted IDs. + """ + + results = request_conversion(gene_ids, species, gprofiler_target_db) + df = pd.DataFrame.from_records(results) + + if df.empty: + return {}, pd.DataFrame() + + # keeping only rows where 'converted' is not null and only the columns of interest + df = df.loc[df["converted"] != "None", COLS_TO_KEEP] + + # dict associating incoming IDs to converted IDs + mapping_dict = df.set_index("incoming").to_dict()["converted"] + + # DataFrame associating converted IDs to name and description + meta_df = df.drop(columns=["incoming"]).rename( + columns={"converted": config.GENE_ID_COLNAME} + ) + + meta_df["name"] = meta_df["name"].str.replace(",", ";") + + # Extract the part before '[Source:...]', or the whole string if not found + meta_df["description"] = ( + meta_df["description"] + .str.replace(DESCRIPTION_PART_TO_REMOVE_REGEX, "", regex=True) + .str.replace(",", ";") + ) + + return mapping_dict, meta_df + + +def chunk_list(lst: list, chunksize: int) -> list: + """Splits a list into chunks of a given size. + + Args: + lst (list): The list to split. + chunksize (int): The size of each chunk. + + Returns: + list: A list of chunks, where each chunk is a list of len(chunksize). 
+ """ + return [lst[i : i + chunksize] for i in range(0, len(lst), chunksize)] + + +def convert_ids( + ids: list[str], species: str, gprofiler_target_db: str +) -> tuple[dict, pd.DataFrame]: + mapping_dict = {} + gene_metadata_dfs = [] + + chunks = chunk_list(ids, chunksize=CHUNKSIZE) + for chunk_gene_ids in chunks: + # converting to Gene IDs for all IDs comprised in this chunk + gene_mapping, meta_df = convert_chunk_of_ids( + chunk_gene_ids, species, gprofiler_target_db + ) + mapping_dict.update(gene_mapping) + gene_metadata_dfs.append(meta_df) + + return mapping_dict, gene_metadata_dfs diff --git a/bin/impute_missing_values.py b/bin/impute_missing_values.py new file mode 100755 index 00000000..962e9728 --- /dev/null +++ b/bin/impute_missing_values.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table +from sklearn.experimental import enable_iterative_imputer # noqa +from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".imputed.parquet" + +THRESHOLD_RATIO_ZEROS = 0.9 + +# KNN +N_NEIGHBORS = 10 + +# ITERATIVE +MAX_ITERATIONS = 10 +N_NEAREST_FEATURES = 100 + +IMPUTERS = ["knn", "iterative", "gene_mean"] + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Perform KNN imputation on count data") + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument("--imputer", choices=IMPUTERS, required=True, dest="imputer") + return 
parser.parse_args() + + +def get_count_columns(df: pl.DataFrame): + return df.select(pl.exclude(config.GENE_ID_COLNAME)).columns + + +def apply_imputer(df: pl.DataFrame, imputer): + # convert to numpy, impute, then convert back + count_matrix = df.select(get_count_columns(df)).to_numpy() + imputed_array = imputer.fit_transform(count_matrix) + return df.with_columns(pl.DataFrame(imputed_array, schema=get_count_columns(df))) + + +def apply_simle_imputer(df: pl.DataFrame): + imputer = SimpleImputer() + return apply_imputer(df, imputer) + + +def apply_knn_imputer(df: pl.DataFrame) -> pl.DataFrame: + imputer = KNNImputer(n_neighbors=N_NEIGHBORS, weights="distance") + return apply_imputer(df, imputer) + + +def apply_iterative_imputer(df: pl.DataFrame) -> pl.DataFrame: + imputer = IterativeImputer( + max_iter=MAX_ITERATIONS, + n_nearest_features=N_NEAREST_FEATURES, + random_state=0, + initial_strategy="mean", + min_value=0, + max_value=1, + imputation_order="random", + verbose=1, + ) + return apply_imputer(df, imputer) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + logger.info(f"Parsing {args.count_file.name}") + df = parse_count_table(args.count_file) + + # logger.info("Separating genes with high number of zeros") + # df, high_zero_genes_df = separate_genes_with_high_number_of_zeros(count_df) + + if args.imputer == "iterative": + logger.info("Applying iterative imputation") + df = apply_iterative_imputer(df) + elif args.imputer == "knn": + logger.info("Applying KNN imputation") + df = apply_knn_imputer(df) + elif args.imputer == "gene_mean": + logger.info("Applying simple imputation") + df = apply_simle_imputer(df) + + export_parquet(df, args.count_file, OUTFILE_SUFFIX) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git 
a/bin/make_cross_join.py b/bin/make_cross_join.py new file mode 100755 index 00000000..f28a597c --- /dev/null +++ b/bin/make_cross_join.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file1", + type=Path, + dest="count_file_1", + required=True, + help="Chunk count file 1", + ) + parser.add_argument( + "--file2", + type=Path, + dest="count_file_2", + required=True, + help="Chunk count file 2", + ) + parser.add_argument( + "--index1", + type=Path, + dest="count_file_1_index", + required=True, + help="Index of chunk count file 1", + ) + parser.add_argument( + "--index2", + type=Path, + dest="count_file_2_index", + required=True, + help="Index of chunk count file 2", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + lf = pl.scan_parquet(args.count_file_1, low_memory=low_memory) + lf_other = pl.scan_parquet(args.count_file_2, low_memory=low_memory) + + logger.info("Computing cross join data") + lf = lf.join( + lf_other, how="cross", suffix="_other" + 
) # Perform a cross join with itself + + df = lf.collect() + if len(df) == 0: + raise ValueError( + f"No output following treatment of files {str(args.count_file_1)} and {str(args.count_file_2)}" + ) + + outfile = f"cross_join.{args.count_file_1_index}.{args.count_file_2_index}.parquet" + df.write_parquet(outfile) + + +if __name__ == "__main__": + main() diff --git a/bin/make_pairwise_gene_expression_ratio.py b/bin/make_pairwise_gene_expression_ratio.py new file mode 100755 index 00000000..0fdc5715 --- /dev/null +++ b/bin/make_pairwise_gene_expression_ratio.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. + +import argparse +import logging +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--file", + type=Path, + dest="cross_joined_file", + required=True, + help="File where each row contains counts for two genes", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. 
+ """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def compute_ratios(file: Path, low_memory: bool) -> pl.LazyFrame: + # getting ratios for each sample + cross_join_lf = pl.scan_parquet(file, low_memory=low_memory) + column_pairs = { + col: f"{col}_other" + for col in get_count_columns(cross_join_lf) + if not col.endswith("_other") + } + return cross_join_lf.select( + [pl.col(config.GENE_ID_COLNAME), pl.col(f"{config.GENE_ID_COLNAME}_other")] + + [ + (pl.col(col) / pl.col(other_col)).log(base=2).alias(f"{col}_log_ratio") + for col, other_col in column_pairs.items() + ] + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + ratios_lf = compute_ratios(args.cross_joined_file, low_memory) + + ratios_df = ratios_lf.collect() + + if len(ratios_df) == 0: + raise ValueError( + f"No output following treatment of file {str(args.cross_joined_file)}" + ) + + outfilename = args.cross_joined_file.name.replace("cross_join", "ratios") + ratios_df.write_parquet(outfilename) + + +if __name__ == "__main__": + main() diff --git a/bin/make_parquet_chunks.py b/bin/make_parquet_chunks.py new file mode 100755 index 00000000..59b8df3a --- /dev/null +++ b/bin/make_parquet_chunks.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from math import ceil +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# experimentally chosen +GENE_CHUNK_SIZE = 100 +ZERO_REPLACE_VALUE = 1e-8 + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Compute M-measure for each gene") + parser.add_argument( + "--counts", + type=Path, + dest="count_file", + required=True, + help="File containing normalised counts for all genes and all samples", + ) + parser.add_argument( + "--task-attempts", + dest="task_attempts", + type=int, + default=1, + help="Number of task attempts", + ) + return parser.parse_args() + + +def get_nb_rows(lf: pl.LazyFrame): + return lf.select(pl.len()).collect().item() + + +def parse_count_dataset(file: Path, low_memory: bool) -> pl.LazyFrame: + lf = pl.scan_parquet(file, low_memory=low_memory).fill_null(0).fill_nan(0) + count_columns = get_count_columns(lf) + cols = [pl.col(config.GENE_ID_COLNAME)] + [ + pl.col(column).replace({0: ZERO_REPLACE_VALUE}).cast(pl.Float64) + for column in count_columns + ] + return lf.select(cols) + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + """Get all column names except the config.GENE_ID_COLNAME column. + + The config.GENE_ID_COLNAME column contains only gene IDs. 
+ """ + return [ + col + for col in lf.collect_schema().names() + if not col.startswith(config.GENE_ID_COLNAME) + ] + + +def split_count_summary_in_chunks(lf: pl.LazyFrame): + lf = lf.with_row_index(name="index") + + nb_rows = get_nb_rows(lf) + logger.info(f"Number of rows (genes) in count file: {nb_rows}") + nb_chunks = ceil(nb_rows / GENE_CHUNK_SIZE) + logger.info(f"Number of chunks: {nb_chunks}") + + for i, start in enumerate(range(0, nb_rows, GENE_CHUNK_SIZE)): + partition = ( + lf.filter( + (pl.col("index") >= start) & (pl.col("index") < start + GENE_CHUNK_SIZE) + ) + .drop("index") + .collect() + ) + outfile = f"count_chunk.{i}.parquet" + partition.write_parquet(outfile) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + low_memory = True if args.task_attempts > 1 else False + logger.info("Parsing count file") + lf = parse_count_dataset(args.count_file, low_memory) + + logger.info("Splitting count file into chunks") + split_count_summary_in_chunks(lf) + + +if __name__ == "__main__": + main() diff --git a/bin/merge_counts.py b/bin/merge_counts.py new file mode 100755 index 00000000..0f9d426c --- /dev/null +++ b/bin/merge_counts.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import hashlib +import json +import logging +from operator import attrgetter +from pathlib import Path + +import config +import polars as pl + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +ALL_COUNTS_PARQUET_OUTFILENAME = "all_counts.parquet" + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser(description="Merge count datasets") + parser.add_argument( + "--counts", type=str, dest="count_files", required=True, help="Count files" + ) + return parser.parse_args() + + +##################################################### +# COUNTS +##################################################### + + +def get_lazyframes(files: list[Path]) -> list[pl.LazyFrame]: + """Get a list of LazyFrames from a list of files.""" + return [pl.scan_parquet(file, low_memory=True) for file in files] + + +def get_columns(lf: pl.LazyFrame) -> list[str]: + return lf.collect_schema().names() + + +def get_count_columns(lf: pl.LazyFrame) -> list[str]: + return [col for col in get_columns(lf) if col != config.GENE_ID_COLNAME] + + +def reproducible_hash(lf: pl.LazyFrame) -> str: + """ + Return a deterministic MD5 hash for a lazyframe. + + Steps: + 1. Convert the tuple (and any nested structures) to a canonical JSON string. + - `sort_keys=True` guarantees that dictionaries are ordered consistently. + - `separators=(',', ':')` removes unnecessary whitespace. + 2. Encode the string as UTF‑8 bytes. + 3. Feed the bytes to hashlib.md5 and return the hex digest. + + The result is a 64‑character hexadecimal string that will be identical + across Python runs, machines, and even different Python versions + (provided the data types are JSON‑compatible). 
+ """ + tpl = tuple(get_columns(lf)) + # Canonical JSON representation + canonical_str = json.dumps(tpl, sort_keys=True, separators=(",", ":")) + # Encode to bytes + data_bytes = canonical_str.encode("utf-8") + # Compute MD5 + hash_obj = hashlib.md5(data_bytes) + return hash_obj.hexdigest() + + +def scan_counts(files: list[Path]) -> list[pl.LazyFrame]: + """ + Get all count data from a list of files. + """ + logger.info("Parsing counts") + # sorting them by file name to ensure consistent order between runs + files.sort(key=attrgetter("name")) + + lfs = get_lazyframes(files) + + # sorting dataframes by a hash on column names + # this is crucial for consistent output of the script + # in case multiple files have the same name + return sorted(lfs, key=lambda lf: reproducible_hash(lf)) + + +def collect_all_gene_ids(lfs: list[pl.LazyFrame]) -> pl.DataFrame: + """ + Collect all gene IDs from a list of lazyframes. + """ + logger.info("Getting the full list of gene IDs") + gene_id_set = set() + for lf in lfs: + lf_gene_ids = lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list() + gene_id_set.update(lf_gene_ids) + return pl.DataFrame({config.GENE_ID_COLNAME: sorted(list(gene_id_set))}) + + +def make_tmp_sorted_dataframes( + lfs: list[pl.LazyFrame], gene_id_df: pl.DataFrame +) -> list[Path]: + """ """ + tmp_files = [] + for i, lf in enumerate(lfs): + # perform left join from gene ids so that all dataframes can be compared row-wise + # removing the gene id column for now + df = gene_id_df.join( + lf.collect(), on=config.GENE_ID_COLNAME, how="left" + ).select(pl.exclude(config.GENE_ID_COLNAME)) + outfile = Path(f"tmp.{i}.parquet") + df.write_parquet(outfile) + tmp_files.append(outfile) + return tmp_files + + +def formating_counts(lf: pl.LazyFrame): + """ + The config.GENE_ID_COLNAME column is cast + to String, and all other columns are cast to Float64. 
+ """ + + # casting count columns to Float64 + # casting gene id column to Stringcount_files + # casting nans to nulls + logger.info("Cleaning merged lazyframe") + return lf.select( + [pl.col(config.GENE_ID_COLNAME).cast(pl.String)] + + [pl.col(column).cast(pl.Float64) for column in get_count_columns(lf)] + ).fill_nan(None) + + +##################################################### +# EXPORT +##################################################### + + +def export_data(lf: pl.LazyFrame): + """Export gene expression data.""" + logger.info(f"Exporting normalised counts to: {ALL_COUNTS_PARQUET_OUTFILENAME}") + lf.sink_parquet(ALL_COUNTS_PARQUET_OUTFILENAME) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + # parsing count files + count_files = [Path(file) for file in args.count_files.split(" ")] + logger.info(f"Merging {len(count_files)} count files") + + lfs = scan_counts(count_files) + + # collecting all gene ids from all lazyframes into a dataframe with one column + gene_id_df = collect_all_gene_ids(lfs) + + # performing a left join between the sorted list of gene if and each collected lazyframe separately + # writing this sorted dataframe in a tmp file + tmp_files = make_tmp_sorted_dataframes(lfs, gene_id_df) + + # scanning the newly created tmp files + lfs = scan_counts(tmp_files) + + # these files are ready to be merged directly through horizontal concatenation + # setting strict=True requires all DataFrames to be the same height, raising an error if not. 
+ merged_lf = pl.concat([gene_id_df.lazy()] + lfs, how="horizontal", strict=True) + + # performing some cleaning / formating operations + merged_lf = formating_counts(merged_lf) + + # exporting merged data in streaming mode + export_data(merged_lf) + + # cleaning up tmp files + for tmp_file in tmp_files: + tmp_file.unlink() + + +if __name__ == "__main__": + main() diff --git a/bin/natural_language_utils.py b/bin/natural_language_utils.py new file mode 100755 index 00000000..79f8463c --- /dev/null +++ b/bin/natural_language_utils.py @@ -0,0 +1,139 @@ +import nltk +from nltk.corpus import wordnet + +nltk.download("punkt_tab") +nltk.download("averaged_perceptron_tagger_eng") +nltk.download("wordnet") + +lemmatizer = nltk.WordNetLemmatizer() +stemmer = nltk.PorterStemmer() + + +def get_wordnet_pos(token: str) -> str: + tag = nltk.pos_tag([token])[0][1][0].upper() + tag_dict = { + "J": wordnet.ADJ, + "N": wordnet.NOUN, + "V": wordnet.VERB, + "R": wordnet.ADV, + } + return tag_dict.get(tag, wordnet.NOUN) # Default to NOUN if not found + + +def get_stemmed_tokens(sentence: str) -> list[str]: + """ + Tokenize a sentence into its constituent words, and then stem each word + + Parameters + ---------- + sentence : str + The sentence to be tokenized and stemmed + + Returns + ------- + tokens : List[str] + The list of stemmed tokens + """ + + tokens = nltk.word_tokenize(sentence) + return [stemmer.stem(token) for token in tokens] + + +def get_lemmed_tokens(sentence: str) -> list[str]: + """ + Tokenize a sentence into its constituent words, and then lemmatize each word + + Parameters + ---------- + sentence : str + The sentence to be tokenized and lemmatized + + Returns + ------- + tokens : List[str] + The list of lemmatized tokens + """ + tokens = nltk.word_tokenize(sentence) + return [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens] + + +def get_synonyms(word) -> set[str]: + """ + Get all synonyms of a word from the wordnet database. 
+
+    Parameters
+    ----------
+    word : str
+        The word for which to get synonyms
+
+    Returns
+    -------
+    synonyms : set
+        A set of all synonyms of the word
+    """
+    synonyms = []
+    for syn in wordnet.synsets(word):
+        for lemma in syn.lemmas():
+            synonyms.append(lemma.name())  # Get the name of each lemma (synonym)
+    return set(synonyms)  # Return as a set to avoid duplicates
+
+
+def get_all_candidate_target_words(sentence: str) -> list[str]:
+    """
+    Get all candidate target words from a sentence by stemming and lemmatizing the
+    tokens and getting synonyms from the wordnet database.
+
+    Parameters
+    ----------
+    sentence : str
+        The sentence from which to get candidate target words
+
+    Returns
+    -------
+    candidates : list
+        A list of all candidate target words
+    """
+    candidates = []
+    # bug fix: this previously called get_stemmed_tokens() as well, so the
+    # lemmatized tokens announced in the docstring were never actually collected
+    lemmatized_tokens = get_lemmed_tokens(sentence)
+    stemmed_tokens = get_stemmed_tokens(sentence)
+    tokens = list(set(lemmatized_tokens + stemmed_tokens))
+    for token in tokens:
+        candidates += get_synonyms(token)
+    return candidates
+
+
+def word_is_in_sentence(word: str, sentence: str) -> bool:
+    """
+    Check if a word (or a stemmed version of it) is in a sentence, or if it is a
+    subword of a stemmed version of any word in the sentence.
+
+    Parameters
+    ----------
+    word : str
+        The word to be searched for
+    sentence : str
+        The sentence in which to search for the word
+
+    Returns
+    -------
+    bool
+        True if the word is found in the sentence, False otherwise
+    """
+    # the candidate target words depend only on the sentence: compute them at
+    # most once (lazily), instead of once per stemmed word as before — the
+    # wordnet synset lookups inside get_all_candidate_target_words are slow
+    candidate_target_words = None
+    for stemmed_word in [word] + get_stemmed_tokens(word):
+        # testing if stemmed word is in sentence as it is
+        if stemmed_word in sentence:
+            return True
+        # or testing if stemmed word is a subword of a stemmed word from the sentence
+        if candidate_target_words is None:
+            candidate_target_words = get_all_candidate_target_words(sentence)
+        for target_word in candidate_target_words:
+            if stemmed_word in target_word:
+                return True
+    return False
+
+
+def keywords_in_fields(fields: list[str], keywords: list[str]) -> list[str]:
+    """
+    Return the keywords found (via word_is_in_sentence) in the given fields.
+
+    NOTE(review): a keyword matching several fields is returned once per
+    matching field; deduplicate the result if unique keywords are needed.
+    """
+    return [
+        keyword
+        for keyword in keywords
+        for field in fields
+        if word_is_in_sentence(keyword, field)
+    ]
diff --git a/bin/normalise_microarray.R b/bin/normalise_microarray.R
new file mode 100755
index 00000000..f9343ebf
--- /dev/null
+++ b/bin/normalise_microarray.R
@@ -0,0 +1,129 @@
+#!/usr/bin/env Rscript
+
+# Written by Olivier Coen. Released under the MIT license.
+
+# we need to install the affy package manually while disabling threading
+# when installed through conda, we get: ERROR; return code from pthread_create() is 22
+# NOTE: this check must run BEFORE library(affy) below — with the previous
+# ordering the script aborted on library(affy) before ever reaching the install
+if (!requireNamespace("affy", quietly = TRUE)) {
+    BiocManager::install("affy", configure.args="--disable-threading", force = TRUE, quiet = TRUE)
+}
+
+# load each required package exactly once, silencing startup messages
+# (previously every package was attached twice, the second time unsuppressed)
+suppressPackageStartupMessages(library(affy))
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(AnnotationDbi))
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(tibble))
+
+# print a traceback when an uncaught error occurs, to ease debugging
+options(error = traceback)
+
+
+#####################################################
+#####################################################
+# ARG PARSER
+#####################################################
+#####################################################
+
+# Parse command-line arguments:
+#   --input: folder containing the CEL files to normalise
+#   --target-gene-id-db: target database for gene IDs (ENSEMBL or ENTREZID)
+get_args <- function() {
+    option_list <- list(
+        make_option("--input", help = "Folder containing CEL files"),
+        make_option("--target-gene-id-db", dest = "target_gene_id_db", help = "Target database for gene IDs (ENSEMBL or ENTREZID)")
+    )
+
+    args <- parse_args(OptionParser(
+        option_list = option_list,
+        description = "Normalize microarray data using RMA"
+    ))
+    return(args)
+}
+
+# Map probe IDs (rownames of data) to gene IDs in the target database.
+# When stringent is TRUE, only probes mapping to exactly one distinct
+# (non-NA) gene ID are kept, dropping ambiguous probes.
+get_probe_id_mapping <- function(data, annot_db, target_gene_id_db, stringent) {
+
+    probe_ids <- rownames(data)
+    annotations <- AnnotationDbi::select(
+        annot_db,
+        keys = probe_ids,
+        columns = c(target_gene_id_db),
+        keytype = "PROBEID"
+    )
+
+    if (stringent) {
+        annotations <- annotations %>%
+            group_by(PROBEID) %>%
+            filter(n_distinct(.data[[target_gene_id_db]], na.rm = TRUE) == 1) %>%
+            ungroup()
+    }
+
+    return(annotations)
+}
+
+# Replace probe-level rownames by target gene IDs, averaging the values of
+# all probes mapping to the same gene
+replace_probe_ids_by_target_ids <- function(data, annotations, target_gene_id_db) {
+    data <- as.data.frame(data)
+    data$PROBEID <- rownames(data)
+
+    data <- merge(annotations, data, by = "PROBEID", all.x = TRUE)
+
+    # computing mean of probe values for each gene
+ data <- data %>% + group_by(.data[[target_gene_id_db]]) %>% + summarise(across(where(is.numeric), function(x) mean(x, na.rm = TRUE))) %>% + ungroup() + + data <- tibble::column_to_rownames(data, var = target_gene_id_db) + return(data) +} + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +main <- function() { + + args <- get_args() + + # Read CEL files from a directory + message("Reading CEL files from", args$input) + data <- ReadAffy(celfile.path = args$input) + + message("Installing annotation database") + db_name <- paste0(annotation(data), ".db") + if (!requireNamespace(db_name, quietly = TRUE)) { + BiocManager::install(db_name, quiet = TRUE) + } + library(db_name, character.only = TRUE) + + # Normalize using RMA (most common method) + eset <- rma(data) + # Extract normalized expression values + message("Extracting normalized expression values") + normalised_data <- exprs(eset) + + annotations <- get_probe_id_mapping( + normalised_data, + annot_db = get(db_name), # Get the database object using get() + target_gene_id_db = args$target_gene_id_db, + stringent = TRUE + ) + + normalised_data_df <- replace_probe_ids_by_target_ids(normalised_data, annotations, args$target_gene_id_db) + + # cleaning colnames + colnames(normalised_data_df) <- sub("\\..*", "", colnames(normalised_data_df)) + colnames(normalised_data_df) <- sub("-", "_", colnames(normalised_data_df)) + + # Save results + message("Saving results to normalised_expression.csv") + write.csv(normalised_data_df, "normalised_expression.csv") + +} + +main() diff --git a/bin/normfinder.py b/bin/normfinder.py new file mode 100755 index 00000000..7cd49282 --- /dev/null +++ b/bin/normfinder.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +import sys +from dataclasses import dataclass, field +from pathlib import Path +from statistics import mean + +import config +import numpy as np +import polars as pl +from common import write_float_csv +from numba import njit, prange +from tqdm import tqdm + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +STABILITY_OUTFILENAME = "stability_values.normfinder.csv" + + +############################################################################ +# POLARS EXTENSIONS +############################################################################ + + +@pl.api.register_expr_namespace("row") +class StatsExtension: + def __init__(self, expr: pl.Expr): + self._expr = expr + + def not_null_values(self): + return self._expr.list.eval(pl.element().drop_nulls().drop_nans()).list + + def mean(self) -> pl.Expr: + """Mean over non nulls values in row""" + return self.not_null_values().mean() + + def sum(self) -> pl.Expr: + """Median over non nulls values in row""" + return self.not_null_values().sum() + + def min(self) -> pl.Expr: + """Median over non nulls values in row""" + return self.not_null_values().min() + + +############################################################################ +# NUMBA-ACCELERATED FUNCTIONS +############################################################################ + + +@njit(parallel=True) +def compute_minvars(z: np.ndarray, target_idx: np.ndarray) -> np.ndarray: + """ + z: (ngenes, nsamples) array + target_idx: 1D array of indices (int64) for which to compute minvar + returns: 1D array of length len(target_idx) + """ + ngenes, nsamples = z.shape + + # should not happen as it is controlled before, but just in case + if nsamples < 2: + raise ValueError("Number of samples must be at least 2") + + minvars = np.empty(len(target_idx), dtype=np.float64) + for k in prange(len(target_idx)): + i = target_idx[k] + # checking if counts for this gene are all nans + nb_valid_counts = 
(~np.isnan(z[i, :])).sum() + if nb_valid_counts < 1: + minvars[k] = np.nan + continue # skip this gene + # computing variances of pairwise differences + minv = 1e18 + for j in prange(ngenes): + if i == j: + continue + diffs = z[i, :] - z[j, :] + mean = np.sum(diffs) / nsamples # scalar + var = np.sum((diffs - mean) ** 2) / (nsamples - 1) # scalar + if np.isnan(var): + continue # skip + if var < minv: + minv = var + minvars[k] = minv / 4.0 if minv < 1e18 else np.inf + return minvars + + +##################################################### +# NORMFINDER CLASS +##################################################### + + +@dataclass +class NormFinder: + count_lf: pl.LazyFrame + design_df: pl.DataFrame + + genes: list[str] = field(init=False) + + group_to_samples_dict: dict[str, list[str]] = field(init=False) + + n_groups: int = field(init=False) + n_genes: int = field(init=False) + + def __post_init__(self): + # format_design + self.design_df = self.design_df.with_columns( + pl.concat_str([pl.col("batch"), pl.col("condition")], separator="_").alias( + "group" + ) + ).select("sample", "group") + + # make dict associating a group to the list of its samples + group_to_sample_df = self.design_df.group_by("group", maintain_order=True).agg( + "sample" + ) # maintain order is better for repeatability and testing + + self.group_to_samples_dict = { + d["group"]: d["sample"] for d in group_to_sample_df.to_dicts() + } + + groups = list(self.group_to_samples_dict.keys()) + self.n_groups = len(groups) + + self.genes = ( + self.count_lf.select(config.GENE_ID_COLNAME).collect().to_series().to_list() + ) + self.n_genes = len(self.genes) + + if self.n_genes <= 2: + logger.error("Too few genes") + sys.exit(100) + + @staticmethod + def get_overall_mean_for_group(df_with_means_over_samples: pl.DataFrame) -> float: + return df_with_means_over_samples.mean().item() + + @staticmethod + def get_means_over_samples(df: pl.DataFrame) -> pl.DataFrame: + return df.with_columns( + 
mean_over_samples_for_gene=pl.concat_list(pl.all()).row.mean() + ).select("mean_over_samples_for_gene") + + def correct_negative_values( + self, intra_var_df: pl.DataFrame, group_count_df: pl.DataFrame + ) -> pl.DataFrame: + genes_with_negative_values = intra_var_df.select( + col for col in self.genes if (intra_var_df[col] < 0).all() + ).columns # intra_var_df has only one row but it is a dataframe + + # getting indexes of genes for which we must compute minvar + indexes_of_genes_with_negative_values = np.array( + [ + i + for i, gene in enumerate(self.genes) + if gene in genes_with_negative_values + ], + dtype=np.int64, + ) + + minvars = compute_minvars( + group_count_df.to_numpy(), indexes_of_genes_with_negative_values + ) + + # associating back minvars to their respective gene + minvar_dict = { + gene: minvars[i] for i, gene in enumerate(genes_with_negative_values) + } + return intra_var_df.with_columns( + [pl.lit(val).alias(col) for col, val in minvar_dict.items()] + ) + + def get_unbiased_intragroup_variance_for_group( + self, + group_count_df: pl.DataFrame, + means_over_samples_df: pl.DataFrame, + group_overall_mean: float, + samples: list[str], + ): + # TODO: see if it's correct + # if only one sample in the group, there's no variance + if len(samples) == 1: + data = {gene: [0] for gene in self.genes} + return pl.DataFrame(data) + + # lf is a lazyframe with a column being the gene ids (gene_id) + # and other columns being the samples + # the current chunk corresponds to only one group + # means_over_samples_df is a single column dataframe containing the means across each row (ie for each gene across samples) + ng = len(samples) + + means_over_samples = means_over_samples_df.to_series().rename( + "mean_over_samples_for_gene" + ) + + mean_over_genes = ( + group_count_df.mean() + .transpose() + .to_series() + .rename("mean_over_genes_for_sample") + ) + + sample_variance_df = ( + group_count_df.hstack( + [means_over_samples] + ) # adding column containing means 
over all samples in this group (for each gene) + .select( + [ + (pl.col(c) - pl.col("mean_over_samples_for_gene")).alias( + c + ) # y_igj - mean(y_ig*) + for c in samples + ] + ) + .transpose( + include_header=True, column_names=self.genes + ) # columns are now genes + .hstack( + [mean_over_genes] + ) # adding column containing means over all genes (for each sample) + .select( + [ + ( + ( + pl.col(c) + - pl.col("mean_over_genes_for_sample") + + group_overall_mean + ) + ** 2 + ).alias( + c + ) # r_igj ^2 = (y_igj - mean(y_ig*) -mean(y_*gj) + mean(y_*g*) ) ^ 2 + for c in self.genes + ] + ) + .transpose(include_header=True, column_names=samples) + .with_columns( + sample_variance=pl.concat_list(samples).row.sum() + / ( + (ng - 1) * (1 - 2 / self.n_genes) + ) # sum over j (samples) of r_igj ^2 terms + ) + .select("sample_variance") + .transpose() + .rename({f"column_{i}": gene for i, gene in enumerate(self.genes)}) + ) + + # sum of all sample variances for all genes + sample_variance_sum_over_genes = sample_variance_df.select( + pl.sum_horizontal(pl.all()) + ).item() # sum of all s_ij² over all genes + + intra_var_df = sample_variance_df.select( + [ + ( + pl.col(c) + - sample_variance_sum_over_genes + / (self.n_genes * (self.n_genes - 1)) + ).alias(c) + for c in self.genes + ] + ) + # if some values are negative, we need a special process + corrected_intra_var_df = self.correct_negative_values( + intra_var_df, group_count_df + ) + + return corrected_intra_var_df + + def get_unbiased_intragroup_variances(self): + unbiased_intragroup_variance_dfs = [] + means_over_samples_dfs = [] + group_overall_means = [] + + for group, samples in tqdm(self.group_to_samples_dict.items()): + # sub dataframe corresponding to this group + chunk_df = self.count_lf.select(samples).collect() + # computing means over samples for each gene + means_over_samples_df = self.get_means_over_samples(chunk_df) + # getting overall expression average in the group for all genes + group_overall_mean = 
self.get_overall_mean_for_group(means_over_samples_df) + + group_unbiased_intragroup_variance_df = ( + self.get_unbiased_intragroup_variance_for_group( + chunk_df, means_over_samples_df, group_overall_mean, samples + ) + ) + + # storing intragroup values for each gene in this group + unbiased_intragroup_variance_dfs.append( + group_unbiased_intragroup_variance_df + ) + # storing means over samples in this group for each gene + means_over_samples_df = means_over_samples_df.rename( + {"mean_over_samples_for_gene": group} + ) + means_over_samples_dfs.append(means_over_samples_df) + # storing overall mean of expression in this group, for all genes and samples + group_overall_means.append(group_overall_mean) + + # cast all values to float (to avoid issues when concat) + unbiased_intragroup_variance_dfs = [ + df.select([pl.col(col).cast(pl.Float64) for col in df.columns]) + for df in unbiased_intragroup_variance_dfs + ] + + # removing None values in group_overall_means + # which would originate from group chunk dataframes that are full of null values + group_overall_means = [mean for mean in group_overall_means if mean is not None] + + # before returning: + # concatenate together all intragroup variance data to have a single df for all groups + # stack all means over samples horizontally (becomes a gene * group df ) + # get the mean of group_overall_means to get the overall mean expression value in the count dataframe + return ( + pl.concat(unbiased_intragroup_variance_dfs), + pl.concat(means_over_samples_dfs, how="horizontal"), + mean(group_overall_means), + ) + + def adjust_for_nb_of_samples_in_groups( + self, unbiased_intragroup_variance_df: pl.DataFrame + ): + n_samples_list = [ + len(samples) for samples in self.group_to_samples_dict.values() + ] + return unbiased_intragroup_variance_df.with_columns( + n_samples=pl.Series(n_samples_list) + ).select([(pl.col(c) / pl.col("n_samples")).alias(c) for c in self.genes]) + + def get_unbiased_intergroup_variance( + self, 
gene_means_in_groups_df: pl.DataFrame, dataset_overall_mean: float + ): + mean_over_genes = ( + gene_means_in_groups_df.mean() + .transpose() + .to_series() + .rename("mean_over_genes_for_group") + ) + + return ( + gene_means_in_groups_df.with_columns( + mean_over_groups_for_gene=pl.concat_list(pl.all()).row.mean() + ) + .select( + [ + (pl.col(c) - pl.col("mean_over_groups_for_gene")).alias(c) + for c in gene_means_in_groups_df.columns + ] + ) + .transpose(column_names=self.genes) + .hstack([mean_over_genes]) + .select( + [ + ( + pl.col(c) + - pl.col("mean_over_genes_for_group") + + dataset_overall_mean + ).alias(c) + for c in self.genes + ] + ) + .select( + [(pl.col(c) ** 2).alias(c) for c in self.genes] + ) # square to get variance + ) + + def compute_gamma_factor(self, diff_df: pl.DataFrame, vardiff_df: pl.DataFrame): + logger.info("Computing gamma factor") + first_term = ( + diff_df.with_columns( + sum_of_squares=pl.concat_list(pl.all()).row.sum() # sum over columns + ) + .select("sum_of_squares") + .sum() # sum over rows + .select( + ( + pl.col("sum_of_squares") + / ((self.n_groups - 1) * (self.n_genes - 1)) + ).alias("normalised_sum_of_squares") + ) + .item() + ) + + second_term = ( + vardiff_df.with_columns( + sum=pl.concat_list(pl.all()).row.sum() # sum over columns + ) + .select("sum") + .sum() # sum over rows + .select( + (pl.col("sum") / (self.n_groups * self.n_genes)).alias("normalised_sum") + ) + .item() + ) + + return max(first_term - second_term, 0) # set to 0 if negative + + @staticmethod + def apply_gamma_factor( + gamma: float, diff_df: pl.DataFrame, vardiff_df: pl.DataFrame + ): + difnew = diff_df * gamma / (gamma + vardiff_df) + varnew = vardiff_df + gamma * vardiff_df / (gamma + vardiff_df) + return difnew, varnew + + def apply_shrinkage( + self, intergroup_variance_df: pl.DataFrame, group_mean_variance_df: pl.DataFrame + ): + gamma = self.compute_gamma_factor( + intergroup_variance_df, group_mean_variance_df + ) + return 
self.apply_gamma_factor( + gamma, intergroup_variance_df, group_mean_variance_df + ) + + def get_stability_values( + self, shrunk_intervar_df: pl.DataFrame, shrunk_gr_mean_var_df: pl.DataFrame + ): + return ( + ( + shrunk_intervar_df.select([pl.col(c).abs() for c in self.genes]) + + shrunk_gr_mean_var_df.select([pl.col(c).sqrt() for c in self.genes]) + ) + .mean() + .transpose( + include_header=True, + header_name=config.GENE_ID_COLNAME, + column_names=[config.NORMFINDER_STABILITY_VALUE_COLNAME], + ) + ) + + def compute_stability_scoring(self): + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # UNBIASED INTRAGROUP VARIANCE + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Computing intragroup variances") + intragroup_variance_df, gene_means_in_groups_df, dataset_overall_mean = ( + self.get_unbiased_intragroup_variances() + ) + + logger.info("Adjusting variances by group size") + group_mean_variance_df = self.adjust_for_nb_of_samples_in_groups( + intragroup_variance_df + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # INTERGROUP VARIANCE + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Computing intergroup variances") + intergroup_variance_df = self.get_unbiased_intergroup_variance( + gene_means_in_groups_df, dataset_overall_mean + ) + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # STABILITY VALUES + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + logger.info("Shrinking intragroup and intergroup variances using gamma factor") + shrunk_intervar_df, shrunk_gr_mean_var_df = self.apply_shrinkage( + intergroup_variance_df, group_mean_variance_df + ) + + logger.info("Computing stability values") + return self.get_stability_values(shrunk_intervar_df, shrunk_gr_mean_var_df) + + +##################################################### +# FUNCTIONS +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + 
description="Quantile normalise count data for each sample in the dataset" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--design", type=Path, dest="design_file", required=True, help="Design file" + ) + return parser.parse_args() + + +def export_stability(stabilities: pl.DataFrame): + """Export stability values to CSV file.""" + logger.info(f"Exporting stability values to: {STABILITY_OUTFILENAME}") + write_float_csv(stabilities, STABILITY_OUTFILENAME) + + +def main(): + args = parse_args() + + logger.info(f"Getting counts from {args.count_file}") + count_lf = pl.scan_parquet(args.count_file) + + logger.info(f"Getting design from {args.design_file}") + design_df = pl.read_csv(args.design_file) + # filter design df to keep only samples that are present in the count dataframe + design_df = design_df.filter( + pl.col("sample").is_in(count_lf.collect_schema().names()) + ) + + nfd = NormFinder(count_lf, design_df) + stabilities = nfd.compute_stability_scoring() + + logger.info(f"Stability values:\n{stabilities}") + export_stability(stabilities) + + +if __name__ == "__main__": + main() diff --git a/bin/quantile_normalise.py b/bin/quantile_normalise.py new file mode 100755 index 00000000..5b26f4d9 --- /dev/null +++ b/bin/quantile_normalise.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +# Written by Olivier Coen. Released under the MIT license. 
+ +import argparse +import logging +from pathlib import Path + +import config +import polars as pl +from common import export_parquet, parse_count_table +from sklearn.preprocessing import quantile_transform + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +OUTFILE_SUFFIX = ".quant_norm.parquet" + +N_QUANTILES = 1000 + +ALLOWED_TARGET_DISTRIBUTIONS = ["normal", "uniform"] + + +##################################################### +##################################################### +# FUNCTIONS +##################################################### +##################################################### + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Quantile normalise count data for each sample in the dataset" + ) + parser.add_argument( + "--counts", type=Path, dest="count_file", required=True, help="Count file" + ) + parser.add_argument( + "--target-distrib", + type=str, + dest="target_distribution", + required=True, + choices=ALLOWED_TARGET_DISTRIBUTIONS, + help="Target distribution to map counts to", + ) + return parser.parse_args() + + +def quantile_normalise(df: pl.DataFrame, target_distribution: str): + """ + Quantile normalize a dataframe; column by column, based on a target distribution. 
+ """ + kwargs = dict( + n_quantiles=N_QUANTILES, output_distribution=target_distribution, subsample=None + ) + return df.with_columns( + pl.exclude(config.GENE_ID_COLNAME).map_batches( + lambda x: quantile_transform(x.to_frame(), **kwargs).flatten(), + return_dtype=pl.Float64, + ) + ) + + +##################################################### +##################################################### +# MAIN +##################################################### +##################################################### + + +def main(): + args = parse_args() + + logger.info(f"Parsing {args.count_file.name}") + count_df = parse_count_table(args.count_file) + + logger.info(f"Quantile normalising {args.count_file.name}") + quantile_normalized_counts = quantile_normalise(count_df, args.target_distribution) + + export_parquet(quantile_normalized_counts, args.count_file, OUTFILE_SUFFIX) + + +if __name__ == "__main__": + main() diff --git a/conf/base.config b/conf/base.config index bfc5fe6d..12e88c25 100644 --- a/conf/base.config +++ b/conf/base.config @@ -8,16 +8,41 @@ ---------------------------------------------------------------------------------------- */ +executor { + cpus = 8 + memory = 24.GB +} + process { - // TODO nf-core: Check the defaults for all processes + resourceLimits = [ + cpus: 16, + memory: '24.GB', + time: '4.h' + ] + cpus = { 1 * task.attempt } memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 
'retry' : 'finish' } - maxRetries = 1 - maxErrors = '-1' + errorStrategy = { + if (task.exitStatus == 100) { // managed errors that should not be retried but ignored at once + 'ignore' + } else if (task.exitStatus == 101) { // connection errors that should be retried + 'retry' + } else if (task.exitStatus in ((130..145) + 104 + 175)) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // after 3 retries, ignore the error + 'terminate' + } + } + + maxRetries = 10 + maxErrors = '-1' // Process-specific resource requirements // NOTE - Please try and reuse the labels below as much as possible. @@ -28,39 +53,40 @@ process { // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } - memory = { 6.GB * task.attempt } - time = { 4.h * task.attempt } + memory = { 2.GB * task.attempt } + time = { 1.h * task.attempt } } withLabel:process_low { - cpus = { 2 * task.attempt } - memory = { 12.GB * task.attempt } - time = { 4.h * task.attempt } + cpus = { 2 } + memory = { 4.GB + 2.GB * task.attempt } + time = { 2.h * task.attempt } } withLabel:process_medium { - cpus = { 6 * task.attempt } - memory = { 36.GB * task.attempt } - time = { 8.h * task.attempt } + cpus = { 4 } + memory = { 6.GB + 2.GB * task.attempt } + time = { 4.h * task.attempt } } withLabel:process_high { - cpus = { 12 * task.attempt } - memory = { 72.GB * task.attempt } - time = { 16.h * task.attempt } - } - withLabel:process_long { - time = { 20.h * task.attempt } - } - withLabel:process_high_memory { - memory = { 200.GB * task.attempt } + cpus = { 4 } + memory = { 8.GB + 4.GB * task.attempt } + time = { 8.h * task.attempt } } - withLabel:error_ignore { - errorStrategy = 'ignore' - } - 
withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 - } - withLabel: process_gpu { - ext.use_gpu = { workflow.profile.contains('gpu') } - accelerator = { workflow.profile.contains('gpu') ? 1 : null } + withLabel:can_fail { + errorStrategy = { + if (task.exitStatus == 100) { // managed errors that should not be retried but ignored at once + 'ignore' + } else if (task.exitStatus == 101) { // connection errors that should be retried + 'retry' + } else if (task.exitStatus in ((130..145) + 104 + 175)) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // after 3 retries, ignore the error + 'ignore' + } + } } + } diff --git a/conf/modules.config b/conf/modules.config index f0b0d55a..70604b85 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -10,21 +10,19 @@ ---------------------------------------------------------------------------------------- */ -process { - - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] +/* +publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } +] +*/ - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } -} +includeConfig 'modules/public_data.config' +includeConfig 'modules/id_mapping.config' +includeConfig 'modules/gene_length.config' +includeConfig 'modules/normalisation.config' +includeConfig 'modules/merging.config' +includeConfig 'modules/statistics.config' +includeConfig 'modules/reporting.config' diff --git a/conf/modules/gene_length.config b/conf/modules/gene_length.config new file mode 100644 index 00000000..4fa9b8be --- /dev/null +++ b/conf/modules/gene_length.config @@ -0,0 +1,17 @@ +process { + + withName: DOWNLOAD_ENSEMBL_ANNOTATION { + publishDir = [ + path: { "${params.outdir}/gene_length" }, + mode: params.publish_dir_mode + ] + } + + withName: COMPUTE_GENE_TRANSCRIPT_LENGTHS { + publishDir = [ + path: { "${params.outdir}/gene_length" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/id_mapping.config b/conf/modules/id_mapping.config new file mode 100644 index 00000000..4e048a37 --- /dev/null +++ b/conf/modules/id_mapping.config @@ -0,0 +1,27 @@ +process { + + withName: COLLECT_GENE_IDS { + publishDir = [ + path: { "${params.outdir}/idmapping/collected_gene_ids" }, + mode: params.publish_dir_mode + ] + } + + withName: GPROFILER_IDMAPPING { + publishDir = [ + path: { "${params.outdir}/idmapping/gprofiler" }, + mode: params.publish_dir_mode + ] + } + + withName: FILTER_AND_RENAME_GENES { + publishDir = [ + path: { "${params.outdir}/idmapping/renamed" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? 
null : filename + } + ] + } + +} diff --git a/conf/modules/merging.config b/conf/modules/merging.config new file mode 100644 index 00000000..f9c1e46d --- /dev/null +++ b/conf/modules/merging.config @@ -0,0 +1,8 @@ +process { + + withName: 'NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:MERGE_DATA:PLATFORM' { + tag = { "${meta.platform}" } + maxForks = 1 + } + +} diff --git a/conf/modules/normalisation.config b/conf/modules/normalisation.config new file mode 100644 index 00000000..94aa5dc7 --- /dev/null +++ b/conf/modules/normalisation.config @@ -0,0 +1,30 @@ +process { + + withName: COMPUTE_CPM { + publishDir = [ + path: { "${params.outdir}/normalised/cpm/${meta.dataset}/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? null : filename + } + ] + } + + withName: COMPUTE_TPM { + publishDir = [ + path: { "${params.outdir}/normalised/tpm/${meta.dataset}/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? null : filename + } + ] + } + + withName: QUANTILE_NORMALISATION { + publishDir = [ + path: { "${params.outdir}/normalised/quantile_normalised/${meta.dataset}/" }, + mode: params.publish_dir_mode + ] + } + +} diff --git a/conf/modules/public_data.config b/conf/modules/public_data.config new file mode 100644 index 00000000..9198c9e2 --- /dev/null +++ b/conf/modules/public_data.config @@ -0,0 +1,39 @@ +process { + + withName: EXPRESSIONATLAS_GETACCESSIONS { + publishDir = [ + path: { "${params.outdir}/public_data/expression_atlas/accessions/" }, + mode: params.publish_dir_mode + ] + } + + withName: EXPRESSIONATLAS_GETDATA { + + publishDir = [ + path: { "${params.outdir}/public_data/expression_atlas/datasets/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? 
null : filename + } + ] + + } + + withName: GEO_GETACCESSIONS { + publishDir = [ + path: { "${params.outdir}/public_data/geo/accessions/" }, + mode: params.publish_dir_mode + ] + } + + withName: GEO_GETDATA { + publishDir = [ + path: { "${params.outdir}/public_data/geo/datasets/" }, + mode: params.publish_dir_mode, + saveAs: { + filename -> ["failure_reason.txt", "warning_reason.txt"].contains(filename) ? null : filename + } + ] + } + +} diff --git a/conf/modules/reporting.config b/conf/modules/reporting.config new file mode 100644 index 00000000..afb22c3f --- /dev/null +++ b/conf/modules/reporting.config @@ -0,0 +1,30 @@ +process { + + withName: AGGREGATE_RESULTS { + publishDir = [ + path: { "${params.outdir}/aggregated" }, + mode: params.publish_dir_mode + ] + } + + withName: 'MULTIQC' { + cpus = { 4 } + memory = { 8.GB * task.attempt } + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: 'copy' + ] + } + + withName: 'DASH_APP' { + publishDir = [ + path: { "${params.outdir}/dash_app/" }, + mode: 'copy', + saveAs: { + filename -> ['versions.yml', 'file_system_backend'].contains(filename) ? null : filename + } + ] + } + +} diff --git a/conf/modules/statistics.config b/conf/modules/statistics.config new file mode 100644 index 00000000..8527990e --- /dev/null +++ b/conf/modules/statistics.config @@ -0,0 +1,8 @@ +process { + + withName: 'NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:GENE_STATISTICS:PLATFORM' { + tag = { "${meta.platform}" } + maxForks = 1 + } + +} diff --git a/conf/test.config b/conf/test.config index 42de053a..dc27bed9 100644 --- a/conf/test.config +++ b/conf/test.config @@ -3,27 +3,19 @@ Nextflow config file for running minimal tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. 
+ It tests the different ways to use the pipeline, with small data Use as follows: - nextflow run nf-core/stableexpression -profile test, --outdir + nextflow run nf-core/stableexpression -profile test_dataset, --outdir ---------------------------------------------------------------------------------------- */ -process { - resourceLimits = [ - cpus: 4, - memory: '15.GB', - time: '1.h' - ] -} - params { - config_profile_name = 'Test profile' + config_profile_name = 'Test dataset profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + species = 'prunus persica' + outdir = "results/test" } diff --git a/conf/test_dataset_eatlas.config b/conf/test_dataset_eatlas.config new file mode 100644 index 00000000..c46350ad --- /dev/null +++ b/conf/test_dataset_eatlas.config @@ -0,0 +1,25 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running full-size tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a full size pipeline test. + This tests the capacity of the pipeline to process a full size dataset. 
+ + Use as follows: + nextflow run nf-core/stableexpression -profile test_full, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + + // Input data + species = 'mus_musculus' + accessions = "E-MTAB-2262" + skip_fetch_eatlas_accessions = true + fetch_geo_accessions = false + datasets = 'https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_big.yaml' + outdir = "results/test_dataset_eatlas" +} diff --git a/conf/test_full.config b/conf/test_full.config index d7bc0dad..316cd153 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -3,6 +3,7 @@ Nextflow config file for running full-size tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a full size pipeline test. + This tests the capacity of the pipeline to process a full size dataset. Use as follows: nextflow run nf-core/stableexpression -profile test_full, --outdir @@ -14,11 +15,7 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Fasta references - fasta = params.pipelines_testdata_base_path + 'viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz' + // Input data + species = 'arabidopsis_thaliana' + outdir = "results/test_full" } diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..1b9ff481 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,137 @@ +# Pipeline configuration + +Although many parameters are directly exposed to the user via the CLI, setting cpu and memory requirements must be done via a configuration file. +By default, the max number of CPUs and memory the pipeline can use is defined in `conf/base.config`: + +``` +executor { + cpus = 8 + memory = 24.GB +} +``` + +This was set quite low on purpose, in order to make it run easily on most data science laptops. + +## Setting hard limits of CPU and memory + +One can modify it by creating a custom config file: + +``` +executor { + cpus = 32 + memory = 200.GB +} +``` + +then launch the pipeline using: + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + -config + ... +``` + +## Modifying resource allocation for processes + +Let's say you have a laptop with only 4 CPUs and 12 GB of RAM. Running the pipeline may crash your computer because of a lack of memory. 
+To tell the pipeline to lower down its resource consumption, you can create a custom config file with: + +``` +executor { + cpus = 4 + memory = 8.GB +} + +withLabel:process_single { + memory = { 2.GB * task.attempt } +} +withLabel:process_low { + memory = { 2.GB + 1.GB * task.attempt } +} +withLabel:process_medium { + memory = { 4.GB + 1.GB * task.attempt } +} +withLabel:process_high { + memory = { 4.GB + 2.GB * task.attempt } +} +``` + +then launch the pipeline using: + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + -config + ... +``` + +> [!WARNING] +> Please keep in mind that if the total number of datasets (downloaded from public datasets or directly provided by the user) is too big for your computer, the pipeline will crash. Even if much effort was made to minimise the memory usage, some steps still require a certain amount of memory to run successfully. + +## Running with Slurm + +This is an example `launch_nf_core_stableexpression.sh` script to run the pipeline on an HPC cluster with Slurm: + +```bash +#!/bin/bash + +# set job name +#SBATCH --job-name=nf_run + +# set output file for logs +#SBATCH --output=logs/nf_run_%j.log + +# if your HPC cluster uses partitions, use a partition allowing for long runs +#SBATCH --partition= + +#to get email notifications +#SBATCH --mail-user= +#SBATCH --mail-type=END,FAIL + +# set max memory available to the Nextflow main node +#SBATCH --mem 2GB + +module load nextflow +# or load specific version: module load nextflow=25.10.04 + +# set location of apptainer cache directory +export NXF_APPTAINER_CACHEDIR=apptainer_cache + +nextflow run nf-core/stableexpression \ + -latest \ + -c slurm.config \ + -profile apptainer \ + -resume \ + --params-file params.yaml +``` + +with `slurm.config`: + +``` +executor { + name = 'slurm' + queue = // if your HPC cluster uses partitions, use a partition including fast runs + queueSize = 50 // see https://seqera.io/blog/5_tips_for_hpc_users/ + submitRateLimit = '10 sec' // see 
https://seqera.io/blog/5_tips_for_hpc_users/ + cpus = 64 // adjust to your needs + memory = 400.GB // adjust to your needs + time = 48.h // optional, only if you want to limit the runtime +} +``` + +and `params.yaml`: + +``` +species: +outdir: +[+ OTHER PARAMETERS] +``` + +Run this script with `sbatch`: + +``` +sbatch launch_nf_core_stableexpression.sh +``` + +For checking the status of the run, we recommend tools like [slurmer](https://crates.io/crates/slurmer). diff --git a/docs/images/nf-core-stableexpression_logo_dark.png b/docs/images/nf-core-stableexpression_logo_dark.png index 1d474b6a..24d8da8b 100644 Binary files a/docs/images/nf-core-stableexpression_logo_dark.png and b/docs/images/nf-core-stableexpression_logo_dark.png differ diff --git a/docs/images/nf-core-stableexpression_logo_light.png b/docs/images/nf-core-stableexpression_logo_light.png index 7e25a84b..c4a8482e 100644 Binary files a/docs/images/nf-core-stableexpression_logo_light.png and b/docs/images/nf-core-stableexpression_logo_light.png differ diff --git a/docs/images/nf_core_stableexpression.metromap.drawio b/docs/images/nf_core_stableexpression.metromap.drawio new file mode 100644 index 00000000..e3953c65 --- /dev/null +++ b/docs/images/nf_core_stableexpression.metromap.drawio @@ -0,0 +1,337 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/docs/images/nf_core_stableexpression.metromap.png b/docs/images/nf_core_stableexpression.metromap.png new file mode 100644 index 00000000..54e3df8e Binary files /dev/null and b/docs/images/nf_core_stableexpression.metromap.png differ diff --git a/docs/output.md b/docs/output.md index dee4caf2..38dd2b30 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,36 +1,137 @@ # nf-core/stableexpression: Output -## Introduction +## Pipeline reports (TLDR) -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +The main output of the pipeline is the MultiQC report, which summarises results at the end of the pipeline. This report is located at `/multiqc/multiqc_report.html` and can be opened in your favorite browser. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. +For advanced users who seek to explore more deeply the distributions of normalised counts gene per gene or sample per sample, a Dash Plotly app is readily prepared at the end of each pipeline run. See [here](#dash-plotly-app) for explanation on how to run the app. - +## Introduction -## Pipeline overview +This document describes the output produced by the pipeline. -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +## Main output files ### MultiQC +This report is located at `multiqc/multiqc_report.html` and can be opened in a browser. +
Output files - `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. + - MultiQC report file: `multiqc_report.html`. + - MultiQC data dir: `multiqc_data`. + - Plots created by MultiQC: `multiqc_plots`.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +### Dash Plotly app + +`dash_app/`: folder containing the Dash Plotly app + +To launch the app, you must first create and activate the appropriate conda environment: + +```bash +conda env create -n nf-core-stableexpression-dash -f /dash_app/spec-file.txt +conda activate nf-core-stableexpression-dash +``` + +then: + +``` +cd dash_app +python app.py +``` + +and open your browser at `http://localhost:8080` + +> [!NOTE] +> The app will try to use the port `8080` by default. If it is already in use, it will try `8081`, `8082` and so on. Check the logs to see which port it is using. + +### Statistics and scoring + +The gene stat summary is also bundled with the Dash Plotly app. + +
+Output files + +- `dash_app/data/all_genes_summary.csv`: file containing all gene statistics, scores and ranked by stability score + +
+ +### Merged data + +The file containing all normalised counts is bundled as a Parquet file with the Dash Plotly app. + +
+Output files + +- `dash_app/data/all_counts.imputed.parquet`: parquet file containing all normalised + imputed gene counts +- `idmapping/global_gene_metadata.csv`: table containing the complete set of gene metadata, obtained either via gProfiler or via the custom file provided by the user +- `idmapping/global_gene_id_mapping.csv`: table containing the complete set of gene id mapping, obtained either via gProfiler or via the custom file - - +- `merged_datasets/whole_design.csv`: table contained designs for all datasets and all samples comprised in the analysis + +
+ +## Other output files of interest (useful for debbuging) + +### Expression Atlas + +
+Output files + +- `public_data/expression_atlas/accessions/`: accessions found when querying Expression Atlas +- `public_data/expression_atlas/datasets/`: count datasets (normalized: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from Expression Atlas. + +
+ +### GEO + +
+Output files + +- `public_data/geo/accessions/`: accessions found when querying GEO +- `public_data/geo/datasets/`: count datasets (normalized: `*.normalised.csv` / raw: `*.raw.csv`) and experimental designs (`*.design.csv`) downloaded from GEO. + +
+ +### IDMapping (g:Profiler) + +
+Output files + + - `renamed`: count datasets with renamed and filtered gene IDs + +
+ +### Normalisation + +
+Output files + +- `normalised/`: Newly normalised datasets + - `tpm/`: with TPM + - `cpm/`: with CPM +- `normalised/quantile_normalised` : Quantile normalised datasets + +### Genome annotation and gene length + +
+Output files + +- `gene_length/`: + - `gene_trnascript_lengths.csv`: table containing gene transcript lengths + - `*.gff*`: downloaded genome annotation + +
+ ### Pipeline information
@@ -39,7 +140,6 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `pipeline_info/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`.
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md new file mode 100644 index 00000000..d68a0792 --- /dev/null +++ b/docs/troubleshooting.md @@ -0,0 +1,68 @@ +# nf-core/stableexpression: Troubleshooting + +## Error 139 on macOS + +If you are running the pipeline on macOS with containers (`docker`, `apptainer`, `singularity`, ...), you may encounter issues like: + +``` +NOTE: Process `NFCORE_STABLEEXPRESSION:STABLEEXPRESSION:ID_MAPPING:CLEAN_GENE_IDS ()` terminated with an error exit status (139) -- Execution is retried (1) +``` + +eventually leading to pipeline failure. + +This is likely due to the python polars library not being compatible with macOS when run inside a container. + +You should run the pipeline with `-profile micromamba` or `-profile conda`. + +## Ǹo dataset found + +For species that are not on Expression Atlas, the pipeline will not be able to find suitable datasets and will log the following message: + +``` +ERROR: Could not find any readily usable public dataset +... +``` + +> [!TIP] +> You can first try to have the pipeline fetch suitable datasets from NCBI GEO by providing the `--fetch_geo_accessions` flag. + +In case no datasets are found, you'll have to find a way to get count datasets and to prepare them for the pipeline. +A good start is to check in the folder `/public_data/geo/datasets/` if there are `rejected` subfolders. Such subfolders contain datasets that were downloaded (together with their experimental design) but failed to pass checks. Quite often, some of them be manually reprocessed to be suitable for the pipeline. + +Finally, you may want to check by yourself on [NCBI GEO](https://www.ncbi.nlm.nih.gov/gds). + +Alternatively, some public websites contain expression datasets that may be suitable for the pipeline, such as: + +- [Bgee](https://www.bgee.org/) + +## Not enough memory + +The pipeline limits the number of downloaded datasets to a certain number in order to limit RAM usage, especially for `homo sapiens`. 
+ +However, on small computers, the limit may be too permissive and lead to RAM overhead. You can reduce the number of datasets downloaded by setting the `--random_sampling_size` parameter to a lower value. + +## Why do I get only a fraction of the public datasets available on Expression Atlas or NCBI GEO? Give them back! + +To reduce the RAM overhead, the pipeline selects randomly a certain number of datasets, based on the number of samples they contain. To increase the number of collected datasets, you can increase the `--random_sampling_size` parameter. + +> [!TIP] +> +> A seed is also set in order to make the runs reproducible. You can change the subset of chosen datasets by changing the `--random_sampling_seed`. + +## The pipeline failed to find a genome annotation for the specified species + +If you know the length of the longest cDNA for each gene, you can provide gene lengths yourself with the `--gene_length` flag (see [Custom gene ID mapping / metadata / length](usage.md#5-custom-gene-id-mapping--metadata)). In case you do not have access to gene length, TPM normalisation cannot be performed. A fallback is to use CPM normalisation by setting `--normalisation_method cpm`. It will introduce a small bias towards long genes, but this should not result in big changes. + +## Java heap space + +In some cases, in particular when running the pipeline on a very large number of datasets (such as for `Homo sapiens`), the Nextflow Java virtual machines can start to request a large amount of memory.
You may happen to see the following error: + +``` +java.lang.OutOfMemoryError: Java heap space +``` + +We recommend to increase the memory available to Java: + +```bash +export NXF_OPTS='-Xms1g -Xmx4g' +``` diff --git a/docs/usage.md b/docs/usage.md index 6ee23ca6..2bc85a3c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,65 +2,249 @@ ## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/stableexpression/usage](https://nf-co.re/stableexpression/usage) +> [!WARNING] +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). + +> [!TIP] +> For setting number of CPUs and memory used by the pipeline, or for instruction on how to run it on an HPC, see the [configuration instructions](configuration.md). + +> [!NOTE] +> In case of issues with the pipeline, please check the [troubleshooting page](troubleshooting.md) or [report a new issue](https://github.com/nf-core/stableexpression/issues). + > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## Introduction +## 1. Basic run + +This pipeline fetches Expression Atlas and GEO accessions for the provided species and downloads the corresponding data. + +```bash +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --outdir \ + -resume +``` + +> [!TIP] +> It is often a good practice to run the pipeline with the `-resume` flag. See the [Nextflow documentation on caching and resuming](https://www.nextflow.io/docs/latest/cache-and-resume.html) for more information. - +> [!NOTE] +> See [here](#profiles) for more information about profiles. -## Samplesheet input +## 2. 
Specific public datasets -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You can provide keywords to restrict downloaded datasets to specific conditions. ```bash ---input '[path to samplesheet file]' +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --keywords + --outdir ``` -### Multiple runs of the same sample +> [!NOTE] +> +> - Multiple keywords must be separated by commas. +> - Please note that keywords are additive: you will get datasets that fit with **either of the provided keywords**. +> - A dataset will be downloaded if a keyword is found in its summary or in the same of a sample. +> - The natural language processing [`nltk`](https://www.nltk.org/) python package is used to find keywords as well as derived words. For example, the `leaf` keyword should match 'leaf', 'leaves', 'leafy', etc. + +## 3. Provide your own accessions -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +You may already have an idea of specific Expression Atlas / GEO accessions you want to use in the analysis. +In this case, you can provide them directly to the pipeline. 
-```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +```bash +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --skip_fetch_eatlas_accessions \ + [--eatlas_accessions ] \ + [--eatlas_accessions_file ] \ + [--geo_accessions ] \ + [--geo_accessions_file ] \ + --outdir ``` -### Full samplesheet +> [!WARNING] +> If you want to download only the datasets corresponding to the accessions supplied, you must set the `--skip_fetch_eatlas_accessions` parameter. -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +> [!NOTE] +> If you provide accessions through `--eatlas_accessions_file` or `--geo_accessions_file`, there must be one accession per line. The extension of the file does not matter. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
+In case you do not know which accessions you want but you would like to control precisely which datasets are included in you analysis, you may run first: -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +```bash +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --accessions_only \ + --outdir ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +Fetched accessions with their respective metadata will be available in `/expression_atlas/accessions/` and `/geo/accessions/` + +## 4. Use your own expression datasets -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +You can of course provide your own counts datasets / experimental designs. + +> [!NOTE] +> +> - To ensure all RNA-seq datasets are processed the same way, users should provide **raw counts**. 
+> - If normalised counts are provided, users should apply the same normalisation process to all of them. **The prefered method is `TPM`**. + +> [!WARNING] +> Microarray data must be already normalised. When mixing your own datasets with public ones in a single run, you should use the `RMA` method in order to be compliant with Expression Atlas and GEO datasets. + +First, prepare a CSV samplesheet listing the different count datasets you want to use. Each row represents a specific dataset and must contain: + +| Column | Description | +| ------------ | ----------------------------------------------------------------------------------------- | +| `counts` | Path to the count dataset (a CSV / TSV file) | +| `design` | Path to the experimental design associated to this dataset (a CSV / TSV file) | +| `platform` | Platform used to generate the counts (`rnaseq` or `microarray`) | +| `normalised` | Boolean (`true` / `false`) representing whether the counts are already normalised or not. | + +It should look as follows: + +```csv title=datasets.csv +counts,design,platform,normalised +path/to/normalised.counts.csv,path/to/normalised.design.csv,rnaseq,true +path/to/raw.counts.csv,path/to/raw.design.csv,rnaseq,false +path/to/microarray.counts.csv,path/to/microarray.design.csv,microarray,true +``` + +It can also be a YAML file: + +```yaml title=datasets.yaml +- counts: path/to/normalised.counts.csv + design: path/to/normalised.design.csv + platform: rnaseq + normalised: true +- counts: path/to/raw.counts.csv + design: path/to/raw.design.csv + platform: rnaseq + normalised: false +- counts: path/to/microarray.counts.csv + design: path/to/microarray.design.csv + platform: microarray + normalised: true +``` + +The counts should have the following structure: + +```csv title=counts.csv +gene_id,sample_A,sample_B,sample_C +gene_1,1,2,3 +gene_2,1,2,3 +``` + +While the design should look like: + +```csv title=design.csv +sample,condition +sample_A,condition_1 +sample_B,condition_2 
+sample_C,condition_1 +``` + +> [!WARNING] +> +> - In the count file, the first header column (corresponding to gene IDs) should not be empty. However, its name can be anything. +> - The count file should not have any column other than the first one (gene IDs) and the sample columns. -## Running the pipeline +> [!TIP] +> Both counts and design files can also be supplied as TSV files. -The typical command for running the pipeline is as follows: +Now run the pipeline with: ```bash -nextflow run nf-core/stableexpression --input ./samplesheet.csv --outdir ./results -profile docker +nextflow run nf-core/stableexpression \ + -profile \ + --species \ + --datasets \ + --skip_fetch_eatlas_accessions \ + --outdir +``` + +> [!TIP] +> The `--skip_fetch_eatlas_accessions` parameter is supplied here to show how to analyse **only your own dataset**. You may remove this parameter if you want to mix you dataset(s) with public ones. + +> [!IMPORTANT] +> By default, the pipeline tries to map gene IDs to Ensembl gene IDs. **All genes that cannot be mapped are discarded from the analysis**. This ensures that all genes are named the same between datasets and allows comparing multiple datasets with each other. If you are confident that your genes have the same name between your different datasets or if you think on the contrary that your gene IDs just won't be mapped properly, you can disable this mapping by adding the `--skip_id_mapping` parameter. In such case, we recommend users to supply their own gene id mapping and gene metadata files using the `--gene_id_mapping` and `--gene_metadata` parameters respectively. +> +> Both files are totally optional, however: +> - a custom gene id mapping might help merging datasets properly +> - custom gene metadata (association between gene id, gene name and gene description) will supply relevant metadata in the final MultiQC report +> +> See [next section](#5-custom-gene-id-mapping--metadata) for further details. 
+ +> [!TIP] +> You can check if your gene IDs can be mapped using the [g:Profiler server](https://biit.cs.ut.ee/gprofiler/convert). + +### 5. Custom gene ID mapping / metadata + +You can supply your own gene ID mapping and / or gene metadata with the `--gene_id_mapping` and `--gene_metadata` parameters respectively. The gene ID mapping file is used to map gene IDs in count table(s) (local or downloaded) to more generic IDs that will be used as a basis for subsequent steps. The gene metadata file provides additional information about the genes, such as their common name and description. + +Structure of the gene id mapping file: + +| Column | Description | +| ------------------ | --------------------------------------------- | +| `original_gene_id` | Gene ID used in the provided count dataset(s) | +| `gene_id` | Mapped gene ID | + +Example: + +```csv title=gene_id_mapping.csv +original_gene_id,gene_id +gene_A,ENSG1234567890 +geneB,OTHERmappedgeneID +``` + +Structure of the gene metadata file: + +| Column | Description | +| ------------- | ---------------- | +| `gene_id` | Mapped gene ID | +| `name` | Gene common name | +| `description` | Gene description | + +Example: + +```csv title=gene_metadata.csv +gene_id,name,description +ENSG1234567890,Gene A,Description of gene A +OTHERmappedgeneID,My OTHER Gene,Another description ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +### 6. Custom gene annotation / gene length + +For the computation of TPM values during gene expression normalisation, the knowledge of gene length is required. In the case where the species of interest does not have a public annotation, or if you are encountering network issues, you can supply directly either your own genome annotation or a file associating gene ids to gene lengths with the `--gff` and `--gene_length` parameters respectively. + +The genome annotation must be in `GFF` format and have the `.gff` extension. 
You can use the [`AGAT`](https://github.com/NBISweden/AGAT) package to convert other genome annotation formats to `GFF`. + +The gene length file must be in `CSV` or `TSV` format and have the following structure: + +| Column | Description | +| --------- | -------------------------------- | +| `gene_id` | Mapped gene ID | +| `length` | Gene length (longest transcript) | + +Example: + +```csv title=gene_length.csv +gene_id,length +ENSG1234567890,1000 +OTHERmappedgeneID,2000 +``` + + +### 7. More advanced scenarios + +For advanced scenarios, you can see the list of available parameters in the [parameter documentation](https://nf-co.re/stableexpression/parameters). + +## Pipeline output Note that the pipeline will create the following files in your working directory: @@ -71,6 +255,10 @@ work # Directory containing the nextflow working files # Other nextflow hidden files, eg. history of pipeline runs and old logs. ``` +For a detailed description of the output files, please consult the [nf-core stableexpression output directory structure](https://nf-co.re/stableexpression/output). + +## Parameters + If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. 
@@ -81,13 +269,14 @@ Pipeline settings can be provided in a `yaml` or `json` file via `-params-file < The above pipeline run specified with a params file in yaml format: ```bash -nextflow run nf-core/stableexpression -profile docker -params-file params.yaml +nextflow run -r dev nf-core/stableexpression -profile docker -params-file params.yaml ``` with: ```yaml title="params.yaml" -input: './samplesheet.csv' +species: 'Homo sapiens' +datasets: './datasets.csv' outdir: './results/' <...> ``` @@ -120,14 +309,35 @@ To further assist in reproducibility, you can use share and reuse [parameter fil > [!NOTE] > These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen) -### `-profile` +### [`-profile`](#profiles) Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. > [!IMPORTANT] -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +> We highly recommend the use of Apptainer (Singularity) or Docker containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. + +> [!TIP] + +> When running the pipeline of multi-user server or on a cluster, the best practice is to use Apptainer (formerly Singularity). You can install Apptainer by following these [instructions](https://apptainer.org/docs/admin/main/installation.html#). 
+> In case you encounter the following error when running Apptainer: +> +> ``` +> ERROR : Could not write info to setgroups: Permission denied +> ERROR : Error while waiting event for user namespace mappings: no event received +> ``` +> +> you may need to install the `apptainer-suid` package instead of `apptainer`: +> +> ``` +> # Debian / Ubuntu +> sudo apt install apptainer-suid +> # RHEL / CentOS +> sudo yum install apptainer-suid +> # Fedora +> sudo dnf install apptainer-suid +> ``` The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to check if your system is supported, please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). @@ -139,6 +349,8 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `test` - A profile with a complete configuration for automated testing - Includes links to test data so needs no other parameters +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `docker` - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` @@ -149,12 +361,12 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - A generic configuration profile to be used with [Charliecloud](https://charliecloud.io/) -- `apptainer` - - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `wave` - A generic configuration profile to enable [Wave](https://seqera.io/wave/) containers. Use together with one of the above (requires Nextflow ` 24.03.0-edge` or later). 
 - `conda`
   - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.
+- `micromamba`
+  - A faster, more lightweight alternative to Conda. As for Conda, use Micromamba as a last resort.
 
 ### `-resume`
 
diff --git a/galaxy/README.md b/galaxy/README.md
new file mode 100644
index 00000000..3aaf6761
--- /dev/null
+++ b/galaxy/README.md
@@ -0,0 +1,96 @@
+# Galaxy
+
+The following instructions need to be performed for each release
+
+>[!TIP]
+>For the first time setup of Galaxy for your Nextflow pipeline, see the [setup instructions](setup.md)
+
+## Activate environment
+
+>[!NOTE]
+>If your planemo environment is not set up, see the [setup instructions](setup.md)
+
+Activate your planemo environment:
+```
+micromamba activate planemo
+```
+
+## At each release: build XML file
+
+### Optional: modify static values in template file
+
+If needed, you can:
+- update the versions of core dependencies (Nextflow, Micromamba, OpenJDK)
+- modify outputs
+- modify tests
+
+>[!NOTE]
+>The versions of core dependencies (Nextflow, Micromamba, OpenJDK) are not updated automatically, although the code necessary for this is already implemented.
+>For now, we want to keep control over the versions used, to avoid versions that may contain bugs.
+
+### Update tool
+
+Update the tool XML file:
+```
+python build/build_tool.py
+```
+
+This script will fetch:
+
+- all the parameters in your nextflow_schema.json
+- the latest version of Nextflow, Micromamba and OpenJDK in Conda channels.
+
+and modify the XML file located at `galaxy/tool_shed/tool/nf_core_stableexpression.xml`.
+
+Your tool is ready to be used!
+
+## Test tool
+
+### Launch local Galaxy server
+
+You may want to have a first look at what your tool looks like in the Galaxy interface.
+To launch a local instance of Galaxy with your tool already installed:
+
+```
+./serve
+```
+
+You can test the behaviour of your tool by providing different inputs and check the corresponding output.
+
+### Linting and testing
+
+To lint your tool:
+
+```
+./lint
+```
+
+>[!WARNING]
+>The test script is not working for now... Planemo does not seem to find the input data for testing...
+>For the moment, testing in a local webserver and linting using the provided script should be sufficient.
+
+To test your tool:
+
+```
+./test
+```
+
+## Publishing to the Galaxy Toolshed
+
+
+```
+cd tool_shed
+```
+
+### Optional: test update on the test Toolshed
+
+If you have already set up an account on the test Toolshed, you can test the update of your tool:
+```
+planemo shed_update --shed_target testtoolshed
+```
+
+### Official Galaxy Toolshed
+
+```
+planemo shed_update --shed_target toolshed
+```
diff --git a/galaxy/build/build_template.py b/galaxy/build/build_template.py
new file mode 100755
index 00000000..0a163465
--- /dev/null
+++ b/galaxy/build/build_template.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+import logging
+from pathlib import Path
+
+from formatters import ConfigFormatter
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+STATIC_TOOL_FILENAME = Path(__file__).parent / "static/template.xml"
+BOILERPLATE_FILENAME = Path(__file__).parent / "static/template.boilerplate.xml"
+
+
+def main():
+    logger.info("Parsing config")
+    pipeline_metadata = ConfigFormatter.get_pipeline_metadata()
+
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # REPLACING ACTUAL PARAMS IN STATIC TOOL
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    with open(BOILERPLATE_FILENAME, "r") as fin:
+        boilerplate_string = fin.read()
+
+    pipeline_name = pipeline_metadata["name"].replace("nf-core/", "")
+
+    logger.info("Building template XML file")
+    template_string = 
boilerplate_string.replace("PIPELINE_NAME", pipeline_name) + + with open(STATIC_TOOL_FILENAME, "w") as fout: + fout.write(template_string) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/galaxy/build/build_tool.py b/galaxy/build/build_tool.py new file mode 100755 index 00000000..ef8e8432 --- /dev/null +++ b/galaxy/build/build_tool.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + +import logging +from pathlib import Path + +from formatters import ConfigFormatter, SchemaFormatter + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +tool_boilerplate_file = Path(__file__).parent / "static/template.xml" +tool_file = Path(__file__).parents[1] / "tool_shed/tool/nf_core_{}.xml" + + +def main(): + logger.info("Formatting config") + # package_versions = ConfigFormatter.get_package_versions() + pipeline_metadata = ConfigFormatter.get_pipeline_metadata() + + logger.info("Formatting schema") + schema_formatter = SchemaFormatter() + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # REPLACING ACTUAL PARAMS IN STATIC TOOL + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + with open(tool_boilerplate_file, "r") as fin: + static_string = fin.read() + + # checking if package versions were filled by the user + for package_version in ["OPENJDK_VERSION"]: + if package_version in static_string: + raise ValueError( + f"You must fill the package version in place of {package_version} before building" + ) + + logger.info("Building tool XML file") + tool_string = ( + static_string + # .replace("NEXTFLOW_VERSION", package_versions["nextflow"]) + # .replace("APPTAINER_VERSION", package_versions["apptainer"]) + # .replace("OPENJDK_VERSION", package_versions["openjdk"]) + .replace("PIPELINE_VERSION", pipeline_metadata["version"]) + .replace("DESCRIPTION", schema_formatter.pipeline_description) + .replace("PARAMETERS", schema_formatter.params_cli) + 
.replace("INPUTS", schema_formatter.inputs) + ) + + pipeline_name = pipeline_metadata["name"].replace("nf-core/", "") + outfile = Path(str(tool_file).format(pipeline_name)) + with open(outfile, "w") as fout: + fout.write(tool_string) + + logger.info("Done") + + +if __name__ == "__main__": + main() diff --git a/galaxy/build/formatters/__init__.py b/galaxy/build/formatters/__init__.py new file mode 100644 index 00000000..4f46ac15 --- /dev/null +++ b/galaxy/build/formatters/__init__.py @@ -0,0 +1,4 @@ +from .schema.base import SchemaFormatter +from .config.base import ConfigFormatter + +__all__ = ["SchemaFormatter", "ConfigFormatter"] diff --git a/galaxy/build/formatters/config/base.py b/galaxy/build/formatters/config/base.py new file mode 100644 index 00000000..3e7fc16d --- /dev/null +++ b/galaxy/build/formatters/config/base.py @@ -0,0 +1,89 @@ +import logging +import re +from dataclasses import dataclass +from pathlib import Path +from typing import ClassVar + +import requests +from packaging.version import parse as vparse + +logger = logging.getLogger(__name__) + + +@dataclass +class BaseConfigFormatter: + """ + Base class for extracting metadata from the pipeline's config files. 
+ """ + + CONFIG_FILE: ClassVar[Path] = Path(__file__).parents[4] / "nextflow.config" + MAIN_FILE: ClassVar[Path] = Path(__file__).parents[4] / "main.nf" + PACKAGES_REPOS: ClassVar[dict] = { + "nextflow": "bioconda", + "micromamba": "conda-forge", + "openjdk": "conda-forge", + } + + @classmethod + def get_package_versions(cls) -> dict: + # CONDA PACKAGE VERSIONS + package_version = {} + for package, repo in cls.PACKAGES_REPOS.items(): + package_version[package] = cls.get_package_version(package, repo) + return package_version + + @staticmethod + def get_package_version(package: str, repo: str) -> str: + """ + Get latest pip version of package + """ + logger.info(f"Getting latest version of package {package}") + url = f" https://api.anaconda.org/package/{repo}/{package}" + try: + response = requests.get(url) + response.raise_for_status() + data = response.json() + versions = sorted( + data["versions"], reverse=True, key=vparse + ) # from latest to oldest + return versions[0] # most recent + except requests.RequestException as e: + raise RuntimeError(f"Error fetching version info: {e}") + + @classmethod + def get_pipeline_metadata(cls) -> dict: + # PARSING CONFIG + with open(cls.CONFIG_FILE, "r") as f: + pipeline_config = f.read() + + # regular expression to find the manifest block and extract the version + manifest_pattern = re.compile(r"manifest\s*{\s*(.*?)\s*}", re.DOTALL) + manifest_match = manifest_pattern.search(pipeline_config) + version = None + name = None + + if manifest_match: + manifest_content = manifest_match.group(1) + + # regular expression to find the version field + name_pattern = re.compile(r'name\s*=\s*[\'"](.*?)[\'"]') + name_match = name_pattern.search(manifest_content) + if name_match: + name = name_match.group(1) + else: + raise ValueError("No name found in pipeline config") + + # regular expression to find the version field + version_pattern = re.compile(r'version\s*=\s*[\'"](.*?)[\'"]') + version_match = 
version_pattern.search(manifest_content) + if version_match: + version = version_match.group(1) + else: + raise ValueError("No version found in pipeline config") + + return dict(name=name, version=version) + + +@dataclass +class ConfigFormatter(BaseConfigFormatter): + pass diff --git a/galaxy/build/formatters/schema/base.py b/galaxy/build/formatters/schema/base.py new file mode 100644 index 00000000..50a702f0 --- /dev/null +++ b/galaxy/build/formatters/schema/base.py @@ -0,0 +1,98 @@ +from pathlib import Path +import json +from dataclasses import dataclass, field +from typing import ClassVar +from . import parameter + + +@dataclass +class SchemaFormatter: + SCHEMA_FILE: ClassVar[Path] = Path(__file__).parents[4] / "nextflow_schema.json" + PARAMS_TO_IGNORE: ClassVar[list] = ["outdir", "email", "multiqc_title"] + SECTIONS_TO_IGNORE: ClassVar[list] = [ + "institutional_config_options", + "generic_options", + ] + SECTIONS_TO_EXPAND: ClassVar[list] = ["input_output_options"] + + pipeline_description: str = field(init=False) + inputs: str = field(init=False) + params_cli: str = field(init=False) + _pipeline_params: dict = field(init=False) + + _inputs: list = field(init=False, default_factory=list) + _params_cli: list = field(init=False, default_factory=list) + + def __post_init__(self): + self.parse_schema_file() + + def parse_schema_file(self): + with open(self.SCHEMA_FILE, "r") as f: + pipeline_schema = json.load(f) + + self.pipeline_description = pipeline_schema["description"].strip("\n") + self._pipeline_params = pipeline_schema["$defs"] + + # PARSING PARAMETERS AND BUILDING STRINGS + for section, section_dict in self._pipeline_params.items(): + if section in self.SECTIONS_TO_IGNORE: + continue + + section_inputs, section_params_cli, section_usage_options = ( + self.format_input_section(section, section_dict) + ) + + self._inputs += section_inputs + self._params_cli += section_params_cli + + self.inputs = "\n".join(self._inputs) + self.params_cli = 
"\n".join(self._params_cli) + + def format_input_section( + self, section: str, section_dict: dict + ) -> tuple[list, list, list]: + section_inputs = [] + section_params_cli = [] + section_usage_options = [] + + section_title = "" + section_help = "" + + if title := section_dict.get("title"): + section_title = f' title="{title}"' + if description := section_dict.get("description"): + section_help = f' help="{description}"' + + section_expanded = ( + ' expanded="true"' + if section in self.SECTIONS_TO_EXPAND + else ' expanded="false"' + ) + + section_inputs.append( + f'\t\t
' + ) + section_usage_options.append("\n\t" + section.capitalize().replace("_", " ")) + + required_params = section_dict.get("required", []) + + for param, param_dict in section_dict["properties"].items(): + if param not in self.PARAMS_TO_IGNORE: + optional = param not in required_params + + # checking if param must be parsed in a generic or in a custom way + if param in parameter.PARAMETER_TO_CUSTOM_CLASS: + class_ = parameter.PARAMETER_TO_CUSTOM_CLASS[param] + else: + class_ = parameter.BaseParameterFormatter + + param_formatter = class_(param, section, param_dict, optional) + + # input arguments + section_inputs.append(param_formatter.get_input()) + # cli + section_params_cli.append(param_formatter.get_cli()) + + section_inputs.append("\t\t
") + + return section_inputs, section_params_cli, section_usage_options diff --git a/galaxy/build/formatters/schema/parameter/__init__.py b/galaxy/build/formatters/schema/parameter/__init__.py new file mode 100644 index 00000000..2708d3be --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/__init__.py @@ -0,0 +1,14 @@ +from .base import BaseParameterFormatter +from .datasets import DatasetsParameterFormatter +from .required import RequiredParameterFormatter + +# from .default_value import DefaultValueParameterFormatter + +PARAMETER_TO_CUSTOM_CLASS = { + "datasets": DatasetsParameterFormatter, + "normalisation_method": RequiredParameterFormatter, + "nb_top_gene_candidates": RequiredParameterFormatter, + # "species": DefaultValueParameterFormatter, +} + +__all__ = ["BaseParameterFormatter"] diff --git a/galaxy/build/formatters/schema/parameter/base.py b/galaxy/build/formatters/schema/parameter/base.py new file mode 100644 index 00000000..87402b0c --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/base.py @@ -0,0 +1,201 @@ +from dataclasses import dataclass +from typing import ClassVar, override + + +@dataclass +class Validator: + """ """ + + PATTERN: ClassVar[str] = ( + '\t\t\t{expression}\n' + ) + + type: str + message: str + expression: str + + @override + def __str__(self): + return self.PATTERN.format( + type=self.type, message=self.message, expression=self.expression + ) + + +@dataclass +class Option: + """ + Represents an option for a parameter. + + Attributes: + value (str): The value of the option. + default_value (str): The default value of the option. + optional (bool): Whether the option is optional. 
+ """ + + PATTERN: ClassVar[str] = ( + '\t\t\t\n' + ) + + value: str + default_value: str | None + optional: bool + + @override + def __str__(self): + selected_arg = ' selected="true"' if self.value == self.default_value else "" + return self.PATTERN.format( + option=self.value, label=self.value.capitalize(), selected_arg=selected_arg + ) + + +@dataclass +class BaseParameterFormatter: + NF_TYPES_TO_GALAXY: ClassVar[dict] = { + "string": "text", + "boolean": "boolean", + "integer": "integer", + "number": "float", + } + BASE_INPUT_PARAM: ClassVar[str] = ( + '\t\t\t' + ) + + param: str + section: str + param_dict: dict + optional: bool + + @staticmethod + def enrich_input_param(input_param_str: str, args: list[str]) -> str: + # opening param for enrichment + input_param_str = input_param_str.replace(" />", ">\n") + # adding each arg in a separate line + for arg in args: + input_param_str += "\t" + arg + # closing + input_param_str += "\t\t\t" + return input_param_str + + @staticmethod + def extract_extensions(extension_str: str): + def clean_extension(ext: str) -> str: + ext = ext.strip().lower() + if ext == "yml": + return "yaml" + return ext + + # removing the .dat extension, that is only used in the pipeline + # in order to allow files from the Galaxy file system (all renamed in .dat) + base_extensions = [ext for ext in extension_str.split("|") if ext != "dat"] + # Galaxy does not allow 'yml', only 'yaml' + return list(set([clean_extension(ext) for ext in base_extensions])) + + def process_file_param(self): + input_type = "data" + # removing extension check as files are renamed in .dat files by Galaxy + if pattern := self.param_dict.get( + "pattern" + ): # going from something like "^\\S+\\.(csv|yaml)$" to "csv,ya + # getting the extensions part + extension_str = pattern.split(".")[-1] + # removes recursively all leading and traling "(", ")" and "$" + extension_str = extension_str.strip("$()") + # getting list of extensions; removing dat because this extension is 
specifically made to handle Galaxy filename + formated_extensions_str = ",".join(self.extract_extensions(extension_str)) + param_format = f' format="{formated_extensions_str}"' + else: + # there is no specific pattern provided in the schema, this means that the format does not matter much + # however, the planemo linter needs a format, so we specify format="data" + param_format = ' format="data"' + return input_type, param_format + + def get_input(self) -> str: + """ + building input param + """ + + # making copy of base input param string + input_param_str = self.BASE_INPUT_PARAM + + param_format = "" + param_label = "" + param_help = "" + param_true_false = "" + param_value = "" + param_min = "" + param_max = "" + param_optional = ' optional="true"' if self.optional else ' optional="false"' + + param_type = self.param_dict["type"] + default_value = self.param_dict.get("default") + + # special case when parameter is a file + if param_type == "string" and self.param_dict.get("format") == "file-path": + input_type, param_format = self.process_file_param() + + # all other types + else: + input_type = self.NF_TYPES_TO_GALAXY[param_type] + + if param_type == "boolean": + param_true_false = f' truevalue="--{self.param}" falsevalue=""' + + elif param_type in ["integer", "number"]: + if minimum := self.param_dict.get("minimum"): + param_min = f' min="{minimum}"' + if maximum := self.param_dict.get("maximum"): + param_max = f' max="{maximum}"' + + elif param_type == "string": + # if there is a pattern for this string, we need to enrich this XML section with a validator + # TODO: handle (rare) case where bot enum and pattern are given + if pattern := self.param_dict.get("pattern"): # regex + msg = f"must match regular expression {pattern}" + validator = Validator(type="regex", message=msg, expression=pattern) + input_param_str = self.enrich_input_param( + input_param_str, args=[str(validator)] + ) + + # handle parameter with enum (options) + if option_values := 
self.param_dict.get("enum"): + input_type = "select" + options = [ + Option(value, default_value, self.optional) for value in option_values + ] + input_param_str = self.enrich_input_param( + input_param_str, args=[str(option) for option in options] + ) + + else: + if default_value is not None: + param_value = f' value="{default_value}"' + + if description := self.param_dict.get("description"): + param_label = f'label="{description}"' + if help_text := self.param_dict.get("help_text"): + param_help = f' help="{help_text}"' + + return input_param_str.format( + param=self.param, + type=input_type, + label=param_label, + format=param_format, + value=param_value, + min=param_min, + max=param_max, + true_false=param_true_false, + help=param_help, + optional=param_optional, + ) + + def get_cli(self) -> str: + # extra quotes if string parameter + value = ( + f'"${self.section}.{self.param}"' + if self.param_dict["type"] == "string" + else f"${self.section}.{self.param}" + ) + if self.optional: + return f"\t\t\t#if ${self.section}.{self.param}\n\t\t\t --{self.param} {value}\n\t\t\t#end if" + else: + return f"\t\t\t--{self.param} {value}" diff --git a/galaxy/build/formatters/schema/parameter/datasets.py b/galaxy/build/formatters/schema/parameter/datasets.py new file mode 100644 index 00000000..be28002d --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/datasets.py @@ -0,0 +1,47 @@ +import re +from dataclasses import dataclass +from typing import override + +from .base import BaseParameterFormatter + + +@dataclass +class DatasetsParameterFormatter(BaseParameterFormatter): + # if param is an optional file with multiple possible values, it requires special handling + # see https://docs.galaxyproject.org/en/latest/dev/schema.html#id51 + + @override + def get_input(self) -> str: + input_param_str = super().get_input() + # setting to required + # changing param name + input_param_str = input_param_str.replace( + 'optional="true"', 'optional="false"' + 
).replace(self.param, "samplesheet") + # changing label + input_param_str = re.sub( + r'label="[\s\w]*"', 'label="Samplesheet"', input_param_str + ) + + # adding conditional statement + return f""" \t\t\t + + + + + + {input_param_str} + + + + + + """ + + @override + def get_cli(self) -> str: + # see https://planemo.readthedocs.io/en/latest/writing_advanced.html#consuming-collections + return f""" + \t#if ${self.section}.datasets.provide_datasets == "true": + \t\t--datasets renamed_samplesheet.csv + \t#end if""" diff --git a/galaxy/build/formatters/schema/parameter/default_value.py b/galaxy/build/formatters/schema/parameter/default_value.py new file mode 100644 index 00000000..e141f461 --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/default_value.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from .base import BaseParameterFormatter + + +@dataclass +class DefaultValueParameterFormatter(BaseParameterFormatter): + def __post_init__(self): + self.param_dict["default"] = "Solanum tuberosum" diff --git a/galaxy/build/formatters/schema/parameter/required.py b/galaxy/build/formatters/schema/parameter/required.py new file mode 100644 index 00000000..52cdbb82 --- /dev/null +++ b/galaxy/build/formatters/schema/parameter/required.py @@ -0,0 +1,8 @@ +from dataclasses import dataclass +from .base import BaseParameterFormatter + + +@dataclass +class RequiredParameterFormatter(BaseParameterFormatter): + def __post_init__(self): + self.optional = False diff --git a/galaxy/build/static/template.boilerplate.xml b/galaxy/build/static/template.boilerplate.xml new file mode 100644 index 00000000..f2429c20 --- /dev/null +++ b/galaxy/build/static/template.boilerplate.xml @@ -0,0 +1,53 @@ + + DESCRIPTION + + nextflow + apptainer + openjdk + + + + + +INPUTS + + + + + + + + + @misc{nf-core/PIPELINE_NAME, + author = {}, + year = {}, + title = {nf-core/PIPELINE_NAME}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = 
{https://github.com/nf-core/PIPELINE_NAME}, + } + + + diff --git a/galaxy/build/static/template.xml b/galaxy/build/static/template.xml new file mode 100644 index 00000000..d8035286 --- /dev/null +++ b/galaxy/build/static/template.xml @@ -0,0 +1,171 @@ + + DESCRIPTION + + nextflow + micromamba + openjdk + + + + + +INPUTS + + + + + + + + +
+ + + + + + + +
+
+ +
+ + + + + + + + + + +
+ +
+ + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + + +
+ + + + + + + + + + +
+
+ + + + @misc{nf-core/stableexpression, + author = {Coen, Olivier}, + year = {2025}, + title = {nf-core/stableexpression}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/OlivierCoen/stableexpression}, + } + + +
diff --git a/galaxy/dev/nextflow_apptainer.xml b/galaxy/dev/nextflow_apptainer.xml new file mode 100644 index 00000000..27f1f851 --- /dev/null +++ b/galaxy/dev/nextflow_apptainer.xml @@ -0,0 +1,34 @@ + + This pipeline is dedicated to finding the most stable genes across count datasets + + nextflow + apptainer + fuse-overlayfs + openjdk + + + results/species.txt + + && zip -r results.zip results + + ]]> + + + + + + + + diff --git a/galaxy/environment.yml b/galaxy/environment.yml new file mode 100644 index 00000000..feae659f --- /dev/null +++ b/galaxy/environment.yml @@ -0,0 +1,10 @@ +name: planemo +channels: + - defaults + - conda-forge + - bioconda + - nodefaults +dependencies: + - python=3.12 + - pip: + - planemo==0.75.33 diff --git a/galaxy/lint b/galaxy/lint new file mode 100755 index 00000000..dd141d2f --- /dev/null +++ b/galaxy/lint @@ -0,0 +1,8 @@ +#!/bin/bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_file="${galaxy_dir}/tool_shed/tool/nf_core_stableexpression.xml" + +planemo lint $tool_file + +planemo shed_lint tool_shed/tool --tools diff --git a/galaxy/serve b/galaxy/serve new file mode 100755 index 00000000..019e9ee0 --- /dev/null +++ b/galaxy/serve @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +galaxy_dir="$(dirname $(readlink -f "$0"))" +tool_dir="${galaxy_dir}/tool_shed/tool" + +planemo serve \ + $tool_dir + +# add --no_cleanup to keep the pipelines workdirs after a run +# very useful for debugging diff --git a/galaxy/setup.md b/galaxy/setup.md new file mode 100644 index 00000000..d6e28afc --- /dev/null +++ b/galaxy/setup.md @@ -0,0 +1,60 @@ +# First time setup + +>[!NOTE] +>The following instructions need only to be performed once when you want to initialise your Galaxy tool and your repository on the Galaxy Toolshed. 
+
+## Setup build / testing environment
+
+Create a new environment with python and planemo installed:
+```
+micromamba env create -f environment.yml -y
+micromamba activate planemo
+```
+
+## Initialise Galaxy tool boilerplate
+
+The XML definition file is partially generated dynamically by:
+
+- parsing nextflow_schema.json
+- fetching latest version of Nextflow, Micromamba and OpenJDK in Conda channels
+
+However, you need to build a boilerplate file with things that cannot be directly interpreted from nextflow_schema.json, such as:
+
+- path to selected output files
+- tests
+- specific conditions for the inputs
+
+### Build template XML file
+
+```
+python build/build_template.py
+```
+
+The template XML file is generated at `galaxy/build/static/template.xml`.
+
+### Customise template XML file
+
+You must edit the template XML file to add your customisations:
+
+- Mandatory (at least if your pipeline uses a samplesheet): modify file paths in the samplesheet
+  Galaxy has its own path system, and you must retrieve dynamically the paths of the files provided, in order to modify them in the samplesheet
+  "Running the pipeline"
+  In this case, add "&&" before "nextflow drop ..."
+
+- modify outputs
+- add tests
+
+## Create repository on Toolshed
+
+All necessary instructions are available in the [Galaxy Toolshed documentation](https://planemo.readthedocs.io/en/master/publishing.html).
+ +For now, you just need to : +- [configure a shed account](https://planemo.readthedocs.io/en/master/publishing.html#configuring-a-shed-account) +- [create a new repository on the Toolshed](https://planemo.readthedocs.io/en/master/publishing.html#creating-a-repository) + +Create a new folder for your tool and place the .shed.yml file in it: + +``` +mkdir -p tool_shed/tool +mv .shed.yml tool_shed +``` diff --git a/galaxy/test b/galaxy/test new file mode 100755 index 00000000..c11e3e34 --- /dev/null +++ b/galaxy/test @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +ARGS="$@" + +GALAXY_DIR="$(dirname $(readlink -f "$0"))" +TOOL_FOLDER="${GALAXY_DIR}/tool_shed/tool" +TOOL_FILE="${TOOL_FOLDER}/nf_core_stableexpression.xml" + + +TEST_OUTDIR="test_output" +mkdir -p $TEST_OUTDIR + +OUTPUT_REPORT="${TEST_OUTDIR}/report.html" +OUTPUT_JSON="${TEST_OUTDIR}/report.json" + +# add --update_test_data to create output file +planemo test \ + $TOOL_FILE \ + --install_galaxy \ + --job_output_files $TEST_OUTDIR \ + --test_output $OUTPUT_REPORT \ + --test_output_json $OUTPUT_JSON \ + --tool_data $TOOL_FOLDER \ + --update_test_data \ + $ARGS diff --git a/galaxy/tool_shed/.shed.yml b/galaxy/tool_shed/.shed.yml new file mode 100644 index 00000000..3c5b94c7 --- /dev/null +++ b/galaxy/tool_shed/.shed.yml @@ -0,0 +1,13 @@ +categories: + - Transcriptomics + - RNA + - Micro-array Analysis +description: Pipeline dedicated to finding the most stable genes across count datasets +homepage_url: https://nf-co.re/stableexpression/ +long_description: | + nf-core/stableexpression is a bioinformatics pipeline that aims at finding the most stable genes among a single or multiple public / local count datasets. + It takes as input a species name (mandatory), keywords for expression atlas search (optional) and / or a CSV input file listing local raw / normalised count datasets (optional). 
+ A typical usage is to find the most suitable qPCR housekeeping genes for a specific species (and optionally specific conditions). +name: nf_core_stableexpression +owner: ocoen +remote_repository_url: https://github.com/nf-core/stableexpression/ diff --git a/galaxy/tool_shed/tool/nf_core_stableexpression.xml b/galaxy/tool_shed/tool/nf_core_stableexpression.xml new file mode 100644 index 00000000..27ccaa5e --- /dev/null +++ b/galaxy/tool_shed/tool/nf_core_stableexpression.xml @@ -0,0 +1,648 @@ + + This pipeline is dedicated to identifying the most stable genes within a single or multiple expression dataset(s). This is particularly useful for identifying the most suitable RT-qPCR reference genes for a specific species. + + nextflow + micromamba + openjdk + + + + + +
+ + ^([a-zA-Z]+)[_ ]([a-zA-Z]+)[_ a-zA-Z]*$ + + + + + + + + + + + + + + + + (([a-zA-Z,]+))? + + + + + + + + + +
+
+ + + + ([A-Z0-9-]+,?)+ + + + + ([A-Z0-9-]+,?)+ + + +
+
+ + + + + + + + + + + + +
+
+ + + +
+
+ + + + + + + + + + + + + + + +
+
+ + + + + ^\d+(\.\d+)?,\d+(\.\d+)?,\d+(\.\d+)?,\d+(\.\d+)?$ + +
+
+ + +
+
+ + + + + + + +
+ + + + + + + +
+
+ +
+ + + + + + + + + + +
+ +
+ + + + + +
+ + + + + + + + + + +
+ +
+ + + + + + + +
+ + + + + + + + + + +
+
+ + + + @misc{nf-core/stableexpression, + author = {Coen, Olivier}, + year = {2025}, + title = {nf-core/stableexpression}, + publisher = {GitHub}, + journal = {GitHub repository}, + url = {https://github.com/OlivierCoen/stableexpression}, + } + + +
diff --git a/galaxy/tool_shed/tool/rebuild_samplesheet.py b/galaxy/tool_shed/tool/rebuild_samplesheet.py new file mode 100644 index 00000000..43666e66 --- /dev/null +++ b/galaxy/tool_shed/tool/rebuild_samplesheet.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +""" +Script dedicated to renaming files in the samplesheet provided. +In Galaxy, data files provided by users are given a new file name. +However, original file names can be retrieved from the name attribute of the file object (inside the tool XML file). +In this script, we replace the original name with the actual Galaxy path. + +""" + +import argparse +import logging +from pathlib import Path +import csv + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--in", dest="samplesheet", type=Path, required=True) + parser.add_argument("--count-files", dest="count_files", type=str, required=True) + parser.add_argument( + "--count-filenames", dest="count_filenames", type=str, nargs="+", required=True + ) + parser.add_argument("--design-files", dest="design_files", type=str, required=True) + parser.add_argument( + "--design-filenames", + dest="design_filenames", + type=str, + nargs="+", + required=True, + ) + parser.add_argument("--out", dest="outfile", type=Path, required=True) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # files and names arrive in the same order + count_files = args.count_files.split(",") + design_files = args.design_files.split(",") + + count_names_to_files = { + name: file for file, name in zip(count_files, args.count_filenames) + } + design_names_to_files = { + name: file for file, name in zip(design_files, args.design_filenames) + } + + renamed_rows = [] + with open(args.samplesheet, "r", newline="") as fin: + reader = csv.DictReader(fin) + header = reader.fieldnames + for row in reader: + # getting original names (file names as written in the 
samplesheet) + original_count_filename = Path(row["counts"]).name + original_design_filename = Path(row["design"]).name + # turning original names into new names (Galaxy file names) + row["counts"] = count_names_to_files[original_count_filename] + row["design"] = design_names_to_files[original_design_filename] + renamed_rows.append(row) + + with open(args.outfile, "w", newline="") as fout: + writer = csv.DictWriter(fout, fieldnames=header) + + writer.writeheader() + for row in renamed_rows: + writer.writerow(row) diff --git a/galaxy/tool_shed/tool/test_data/input.csv b/galaxy/tool_shed/tool/test_data/input.csv new file mode 100644 index 00000000..6ea4aa16 --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/input.csv @@ -0,0 +1,3 @@ +counts,design,platform,normalised +tests/test_data/input_datasets/microarray.normalised.csv,tests/test_data/input_datasets/microarray.normalised.design.csv,microarray,true +tests/test_data/input_datasets/rnaseq.raw.csv,tests/test_data/input_datasets/rnaseq.raw.design.csv,rnaseq,false diff --git a/galaxy/tool_shed/tool/test_data/microarray.normalised.csv b/galaxy/tool_shed/tool/test_data/microarray.normalised.csv new file mode 100644 index 00000000..81f3f904 --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/microarray.normalised.csv @@ -0,0 +1,10 @@ +gene_id,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 +ENSRNA049453121,20925.1255070264,136184.261516502,144325.370645564,89427.0987612997,164143.182734208,34178.6378088171,28842.7323281157,76973.395782103,41906.9367255656,44756.5602263121,252562.049703724,6953.65643340122 +ENSRNA049453138,196173.051628372,16607.8367703051,344972.83715281,22602.4535330758,13678.598561184,104546.421532852,15451.4637472048,71664.8857281649,160643.257448002,91459.0578537683,88396.7173963033,281623.08555275 
+ENSRNA049454388,91547.4240932405,11625.4857392136,84483.143792525,80582.6604222701,218857.576978944,58304.7350856292,42234.0009090266,88475.1675656357,87306.1181782617,17513.436610296,90922.3378933406,76490.2207674135 +ENSRNA049454416,20925.1255070264,106290.155329953,193607.204524536,47170.3378081581,392119.825420608,190998.270108096,90648.5873169351,81397.1541603848,83813.8734511313,165404.67909724,111127.301869638,194702.380135234 +ENSRNA049454647,99394.3461583754,91343.1022366783,3520.13099135521,71738.2220832404,118547.854196928,20105.0810640101,81377.7090686122,15040.7784861581,66352.6498154789,110918.431865208,55563.6509348192,111258.50293442 +ENSRNA049454661,175247.926121346,66431.3470812206,24640.9169394865,52083.9146631746,360203.095444512,36189.1459152181,70046.6356539953,85820.9125386666,13968.9789085219,50594.3724297441,25256.2049703724,52152.4232505092 +ENSRNA049454747,117703.830977024,154452.881963838,281610.479308417,29481.4611300988,191500.379856576,152798.616086476,53565.0743236435,14156.0268105017,293348.557078959,155674.99209152,63140.5124259309,243377.975169043 +ENSRNA049454887,2615.6406883783,164417.584026021,28161.0479308417,82548.0911642767,50154.861391008,136714.551235268,97859.270398964,64586.872322914,328271.004350264,159566.866893808,151537.229822234,86920.7054175153 +ENSRNA049454931,177863.566809724,81378.4001744952,235848.776420799,88444.3833902964,18238.131414912,120630.48638406,82407.8066517592,50430.8455124123,118736.320722436,68107.8090400402,232357.085727426,163410.926184929 diff --git a/galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv b/galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv new file mode 100644 index 00000000..d31e5cef --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/microarray.normalised.design.csv @@ -0,0 +1,13 @@ +sample,condition +GSM1528575,g1 +GSM1528576,g1 +GSM1528579,g1 +GSM1528583,g2 +GSM1528584,g2 +GSM1528585,g2 +GSM1528580,g3 +GSM1528586,g3 +GSM1528582,g3 +GSM1528578,g4 
+GSM1528581,g4 +GSM1528577,g4 diff --git a/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv b/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv new file mode 100644 index 00000000..a9a6bdb4 --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/rnaseq.raw.csv @@ -0,0 +1,10 @@ +,ESM1528575,ESM1528576,ESM1528579,ESM1528583,ESM1528584,ESM1528585,ESM1528580,ESM1528586,ESM1528582,ESM1528578,ESM1528581,ESM1528577 +ENSRNA049453121,1,82,8,82,4,68,88,73,46,57,25,22 +ENSRNA049453138,68,93,41,84,36,18,28,92,84,85,92,32 +ENSRNA049454388,38,10,0,23,11,17,95,57,25,82,10,70 +ENSRNA049454416,75,55,7,30,79,60,15,97,12,35,60,56 +ENSRNA049454647,35,64,55,91,48,95,68,100,24,26,100,47 +ENSRNA049454661,8,99,80,48,86,29,80,17,19,9,44,2 +ENSRNA049454747,67,7,98,53,3,10,52,87,4,80,22,15 +ENSRNA049454887,8,40,24,90,42,52,79,81,94,23,35,81 +ENSRNA049454931,45,49,67,73,26,76,41,16,34,47,36,25 diff --git a/galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv b/galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv new file mode 100644 index 00000000..469751d2 --- /dev/null +++ b/galaxy/tool_shed/tool/test_data/rnaseq.raw.design.csv @@ -0,0 +1,13 @@ +sample,condition +ESM1528575,g1 +ESM1528576,g1 +ESM1528579,g1 +ESM1528583,g2 +ESM1528584,g2 +ESM1528585,g2 +ESM1528580,g3 +ESM1528586,g3 +ESM1528582,g3 +ESM1528578,g4 +ESM1528581,g4 +ESM1528577,g4 diff --git a/main.nf b/main.nf index 40987e52..be4d42ea 100644 --- a/main.nf +++ b/main.nf @@ -15,9 +15,10 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { STABLEEXPRESSION } from './workflows/stableexpression' +include { STABLEEXPRESSION } from './workflows/stableexpression' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_stableexpression_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_stableexpression_pipeline' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NAMED WORKFLOWS FOR PIPELINE @@ -30,16 
+31,15 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_stab workflow NFCORE_STABLEEXPRESSION { take: - samplesheet // channel: samplesheet read in from --input + input_datasets main: // // WORKFLOW: Run pipeline // - STABLEEXPRESSION ( - samplesheet - ) + STABLEEXPRESSION( input_datasets ) + emit: multiqc_report = STABLEEXPRESSION.out.multiqc_report // channel: /path/to/multiqc_report.html } @@ -61,7 +61,7 @@ workflow { params.monochrome_logs, args, params.outdir, - params.input, + params.datasets, params.help, params.help_full, params.show_hidden @@ -71,7 +71,7 @@ workflow { // WORKFLOW: Run main workflow // NFCORE_STABLEEXPRESSION ( - PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.input_datasets ) // // SUBWORKFLOW: Run completion tasks diff --git a/modules.json b/modules.json index 61b6f208..ca279222 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "multiqc": { "branch": "master", - "git_sha": "af27af1be706e6a2bb8fe454175b0cdf77f47b49", + "git_sha": "79b36b51048048374b642289bfe9e591ef56fe05", "installed_by": ["modules"] } } @@ -21,12 +21,12 @@ }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "271e7fc14eb1320364416d996fb077421f3faed2", + "git_sha": "df4d1c8cdee98a1bbbed8fc51e82296568e0f9c1", "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "4b406a74dc0449c0401ed87d5bfff4252fd277fd", + "git_sha": "e753770db613ce014b3c4bc94f6cba443427b726", "installed_by": ["subworkflows"] } } diff --git a/modules/local/aggregate_results/environment.yml b/modules/local/aggregate_results/environment.yml new file mode 100644 index 00000000..93ba05a4 --- /dev/null +++ b/modules/local/aggregate_results/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - 
conda-forge::polars==1.39.2 + - conda-forge::pyyaml==6.0.3 diff --git a/modules/local/aggregate_results/main.nf b/modules/local/aggregate_results/main.nf new file mode 100644 index 00000000..56173a85 --- /dev/null +++ b/modules/local/aggregate_results/main.nf @@ -0,0 +1,43 @@ +process AGGREGATE_RESULTS { + debug true + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7e/7e08ea26f496697870f6afe87a9def87c1038c000306c9280719d40ee9797293/data': + 'community.wave.seqera.io/library/polars_python_pyyaml:0d7b8bed8db11ef1' }" + + input: + path count_file + path stat_score_files + path platform_stat_files, stageAs: "?/*" + val target_genes + path metadata_files + path mapping_files + path multiqc_config + + output: + path 'all_genes_summary.csv', emit: all_genes_summary + path '*most_stable_genes_summary.csv', emit: most_stable_genes_summary + path '*most_stable_genes_transposed_counts.csv', emit: most_stable_genes_transposed_counts_filtered + path 'custom_content_multiqc_config.yaml', emit: custom_content_multiqc_config + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('pyyaml'), eval('python3 -c "import yaml; print(yaml.__version__)"'), topic: versions + + script: + def mapping_files_arg = mapping_files ? "--mappings " + "$mapping_files" : "" + def metadata_files_arg = metadata_files ? "--metadata " + "$metadata_files" : "" + def target_genes_arg = target_genes ? 
"--target-genes " + "${target_genes.join(' ')}" : "" + """ + aggregate_results.py \\ + --counts $count_file \\ + --stats-with-scores $stat_score_files \\ + --platform-stats $platform_stat_files \\ + --multiqc-config $multiqc_config \\ + $mapping_files_arg \\ + $metadata_files_arg \\ + $target_genes_arg + """ + +} diff --git a/modules/local/clean_gene_ids/environment.yml b/modules/local/clean_gene_ids/environment.yml new file mode 100644 index 00000000..df720d08 --- /dev/null +++ b/modules/local/clean_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.37.1 diff --git a/modules/local/clean_gene_ids/main.nf b/modules/local/clean_gene_ids/main.nf new file mode 100644 index 00000000..6165ab07 --- /dev/null +++ b/modules/local/clean_gene_ids/main.nf @@ -0,0 +1,27 @@ +process CLEAN_GENE_IDS { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/87/878943dcc1b8e30cd535a41886e0f75fcd8bbe667b2d2b0bc4adb0c549539e64/data': + 'community.wave.seqera.io/library/polars_python:07cce0ec1b0aeb84' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('*.cleaned.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: id_cleaning_failure_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + clean_gene_ids.py \\ + --count-file "$count_file" + """ + +} diff --git a/modules/local/collect_all_gene_ids/environment.yml b/modules/local/collect_all_gene_ids/environment.yml new file mode 100644 index 00000000..75afc696 --- /dev/null +++ b/modules/local/collect_all_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.2 + - conda-forge::tqdm==4.67.1 diff --git a/modules/local/collect_all_gene_ids/main.nf b/modules/local/collect_all_gene_ids/main.nf new file mode 100644 index 00000000..6f72b3c1 --- /dev/null +++ b/modules/local/collect_all_gene_ids/main.nf @@ -0,0 +1,25 @@ +process COLLECT_ALL_GENE_IDS { + + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/70/70c17cde84896904c0620d614cba74ff029f1255db64e66416e63c91b7c959a2/data': + 'community.wave.seqera.io/library/python_tqdm:4e039400f75bdad0' }" + + input: + path count_files, stageAs: "?/*" + + output: + path 'unique_gene_ids.txt', emit: unique_gene_ids + path 'gene_id_occurrences.csv', emit: gene_id_occurrences + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + """ + collect_gene_ids.py \\ + --ids "$count_files" + """ + +} diff --git a/modules/local/collect_statistics/environment.yml b/modules/local/collect_statistics/environment.yml new file mode 100644 index 00000000..5d27c0af --- /dev/null +++ b/modules/local/collect_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.7 + - conda-forge::pandas==2.3.3 diff --git a/modules/local/collect_statistics/main.nf b/modules/local/collect_statistics/main.nf new file mode 100644 index 00000000..83e3d5c7 --- /dev/null +++ b/modules/local/collect_statistics/main.nf @@ -0,0 +1,25 @@ +process COLLECT_STATISTICS { + + tag "${file.baseName}" + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3d/3d7126100b0eb7cb53dfb50291707ea8dda3b9738b76551ab73605d0acbe114b/data': + 'community.wave.seqera.io/library/pandas:2.3.3--5a902bf824a79745' }" + + input: + path file + + output: + path '*.transposed.csv', emit: csv + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + + script: + """ + collect_statistics.py \\ + --file $file + """ + +} diff --git a/modules/local/compute_dataset_statistics/environment.yml b/modules/local/compute_dataset_statistics/environment.yml new file mode 100644 index 00000000..df720d08 --- /dev/null +++ b/modules/local/compute_dataset_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.12.8 + - conda-forge::polars==1.37.1 diff --git a/modules/local/compute_dataset_statistics/main.nf b/modules/local/compute_dataset_statistics/main.nf new file mode 100644 index 00000000..7a97e31c --- /dev/null +++ b/modules/local/compute_dataset_statistics/main.nf @@ -0,0 +1,27 @@ +process COMPUTE_DATASET_STATISTICS { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/87/878943dcc1b8e30cd535a41886e0f75fcd8bbe667b2d2b0bc4adb0c549539e64/data': + 'community.wave.seqera.io/library/polars_python:07cce0ec1b0aeb84' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta.dataset), path("skewness.txt"), topic: skewness + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def prefix = task.ext.prefix ?: "${meta.dataset}" + """ + compute_dataset_statistics.py \\ + --counts $count_file + """ + +} diff --git a/modules/local/compute_gene_statistics/environment.yml b/modules/local/compute_gene_statistics/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/compute_gene_statistics/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/compute_gene_statistics/main.nf b/modules/local/compute_gene_statistics/main.nf new file mode 100644 index 00000000..ba84fd16 --- /dev/null +++ b/modules/local/compute_gene_statistics/main.nf @@ -0,0 +1,36 @@ +process COMPUTE_GENE_STATISTICS { + + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file, name: 'count_file.parquet'), path(imputed_count_file, name: 'imputed_count_file.parquet') + path ratio_nulls_per_samples + val max_null_ratio_valid_sample + + output: + path '*stats_all_genes.csv', emit: stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def args = task.ext.args ?: '' + if ( meta.platform != "all" ) { + args += " --platform $meta.platform" + } + if ( imputed_count_file ) { + args += " --imputed-counts imputed_count_file.parquet" + } + """ + compute_gene_statistics.py \\ + --counts count_file.parquet \\ + --ratio-nulls-per-sample $ratio_nulls_per_samples \\ + --max-ratio-null-valid-sample $max_null_ratio_valid_sample \\ + $args + """ + +} diff --git a/modules/local/compute_gene_transcript_lengths/environment.yml b/modules/local/compute_gene_transcript_lengths/environment.yml new file mode 100644 index 00000000..5d27c0af --- /dev/null +++ b/modules/local/compute_gene_transcript_lengths/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.13.7 + - conda-forge::pandas==2.3.3 diff --git a/modules/local/compute_gene_transcript_lengths/main.nf b/modules/local/compute_gene_transcript_lengths/main.nf new file mode 100644 index 00000000..ada61de4 --- /dev/null +++ b/modules/local/compute_gene_transcript_lengths/main.nf @@ -0,0 +1,38 @@ +process COMPUTE_GENE_TRANSCRIPT_LENGTHS { + + label 'process_single' + + tag 
"${gff3.baseName}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3d/3d7126100b0eb7cb53dfb50291707ea8dda3b9738b76551ab73605d0acbe114b/data': + 'community.wave.seqera.io/library/pandas:2.3.3--5a902bf824a79745' }" + + input: + path gff3 + + output: + path('gene_transcript_lengths.csv'), emit: csv + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + + script: + def is_compressed = gff3.getExtension() == "gz" ? true : false + def gff3_name = is_compressed ? gff3.getBaseName() : gff3 + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${gff3} > ${gff3_name} + fi + + compute_gene_transcript_lengths.py \\ + --annotation ${gff3_name} + """ + + + stub: + """ + touch gene_transcript_lengths.csv + """ + +} diff --git a/modules/local/compute_stability_scores/environment.yml b/modules/local/compute_stability_scores/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/compute_stability_scores/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/compute_stability_scores/main.nf b/modules/local/compute_stability_scores/main.nf new file mode 100644 index 00000000..48f8f0e3 --- /dev/null +++ b/modules/local/compute_stability_scores/main.nf @@ -0,0 +1,32 @@ +process COMPUTE_STABILITY_SCORES { + + tag "${meta.section}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ 
workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(normfinder_stability_file), path(genorm_stability_file), path(section_stat_file) + val stability_score_weights + + output: + path "${meta.section}.stats_with_scores.csv", emit: stats_with_stability_scores + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def genorm_stability_file_arg = genorm_stability_file ? "--genorm-stability $genorm_stability_file" : "" + """ + compute_stability_scores.py \\ + --stats $section_stat_file \\ + --weights "$stability_score_weights" \\ + --normfinder-stability $normfinder_stability_file \\ + $genorm_stability_file_arg + + mv stats_with_scores.csv ${meta.section}.stats_with_scores.csv + """ + +} diff --git a/modules/local/dash_app/app/app.py b/modules/local/dash_app/app/app.py new file mode 100755 index 00000000..27fc7bf6 --- /dev/null +++ b/modules/local/dash_app/app/app.py @@ -0,0 +1,91 @@ +import socket +import dash_mantine_components as dmc + +from dash_extensions.enrich import ( + DashProxy, + html, + ServersideOutputTransform, + TriggerTransform, +) +from dash_extensions.logging import NotificationsLogHandler + +from src.utils import config, style +from src.components import stores, tooltips +from src.components import top, right_sidebar +from src.callbacks import common, genes, samples + +DEBUG = True +# DEBUG = False + +# -------------------- SETUP LOGGING -------------------- + +log_handler = NotificationsLogHandler() +logger = log_handler.setup_logger(__name__) + +# 
-------------------- APP -------------------- +# init the application +logger.info("Creating app") + +app = DashProxy( + __name__, + title=config.APP_TITLE, + prevent_initial_callbacks="initial_duplicate", + suppress_callback_exceptions=(not DEBUG), + update_title=config.UPDATE_TITLE, + external_stylesheets=[dmc.styles.ALL], + transforms=[TriggerTransform(), ServersideOutputTransform()], +) + +# -------------------- LAYOUT -------------------- + + +def serve_layout(): + return dmc.MantineProvider( + children=[ + html.Div( + [ + top.header, + right_sidebar.drawer, + *stores.stores_to_load, + *tooltips.tooltips_to_load, + ] + + log_handler.embed(), + id="layout", + style=style.LAYOUT, + ) + ] + ) + + +app.layout = serve_layout + +# -------------------- IMPORTING CALLBACKS -------------------- + +common.register_callbacks() +genes.register_callbacks() +samples.register_callbacks() + +# -------------------- LAUNCH SERVER -------------------- + + +def find_port(port: int) -> int: + """Find a port not in use starting at given port""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + if s.connect_ex(("localhost", port)) == 0: + return find_port(port=port + 1) + else: + return port + + +if __name__ == "__main__": + logger.info("Running server") + # setting prune_errors to False avoids error message pruning + # in order to get original tracebacks + # (very useful for debugging) + prune_errors = False if DEBUG else True + app.run( + debug=DEBUG, + host=config.HOST, + port=find_port(port=config.PLOTLY_APP_PORT), + dev_tools_prune_errors=prune_errors, + ) diff --git a/modules/local/dash_app/app/assets/style.css b/modules/local/dash_app/app/assets/style.css new file mode 100755 index 00000000..f32fc1a6 --- /dev/null +++ b/modules/local/dash_app/app/assets/style.css @@ -0,0 +1,9 @@ +.modebar { + background: transparent; + left: 50%; + transform: translateX(-50%); +} + +.mantine-Drawer-root { + width: 0.1em !important; +} diff --git 
a/modules/local/dash_app/app/environment.yml b/modules/local/dash_app/app/environment.yml new file mode 100644 index 00000000..ba925002 --- /dev/null +++ b/modules/local/dash_app/app/environment.yml @@ -0,0 +1,16 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==2.3.3 + - conda-forge::polars==1.39.2 + - conda-forge::pyarrow==23.0.1 + - conda-forge::scipy==1.17.1 + - conda-forge::dash==3.3.0 + - conda-forge::dash-mantine-components==2.4.0 + - conda-forge::dash-extensions==2.0.4 + - conda-forge::dash-iconify==0.1.2 + - conda-forge::dash-ag-grid==32.3.2 diff --git a/modules/local/dash_app/app/src/callbacks/common.py b/modules/local/dash_app/app/src/callbacks/common.py new file mode 100644 index 00000000..bd349a21 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/common.py @@ -0,0 +1,36 @@ +from dash_extensions.enrich import Input, Trigger, Output, State, callback + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def register_callbacks(): + @callback( + Output("drawer", "opened"), + Trigger("settings-button", "n_clicks"), + prevent_initial_call=True, + ) + def open_drawer(): + return True + + @callback( + Output("sidebar-genes-items", "style"), + Output("sidebar-samples-items", "style"), + Input("tabs", "value"), + State("sidebar-genes-items", "style"), + State("sidebar-samples-items", "style"), + ) + def manage_drawer_content( + tabs_value: str, gene_stack_style: dict, sample_stack_style: dict + ): + if tabs_value == "genes": + gene_stack_style["display"] = "block" + sample_stack_style["display"] = "none" + else: # tabs_value == 'samples': + gene_stack_style["display"] = "none" + sample_stack_style["display"] 
= "block" + return gene_stack_style, sample_stack_style diff --git a/modules/local/dash_app/app/src/callbacks/genes.py b/modules/local/dash_app/app/src/callbacks/genes.py new file mode 100644 index 00000000..10cd2f74 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/genes.py @@ -0,0 +1,109 @@ +import plotly.graph_objects as go +from dash_extensions.enrich import Input, Output, Serverside, State, callback, ctx +from src.utils.data_management import DataManager + +data_manager = DataManager() + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def get_selected_rows(selected_genes: list[str]) -> list[dict]: + return data_manager.all_genes_stat_df.filter( + data_manager.all_genes_stat_df["gene_id"].is_in(selected_genes) + ).to_dicts() + + +def register_callbacks(): + @callback( + Output("gene-counts", "data"), + Output("gene-dropdown", "value"), + Output("gene-stats-table", "selectedRows"), + Input("gene-dropdown", "value"), + Input("gene-stats-table", "selectedRows"), + State("gene-counts", "data"), + # prevent_initial_call=True, + ) + def update_gene_stored_data( + selected_genes: list[str], table_selected_rows: list[dict], stored_data: dict + ) -> dict: + if ctx.triggered_id == "gene-stats-table": + # updating selected genes + if table_selected_rows is not None: + selected_genes = [row["gene_id"] for row in table_selected_rows] + else: + selected_genes = [] + else: + # ctx.triggered_id is None (callback triggered at app launch / refresh) + # or ctx.triggered_id == "gene-dropdown": + # taking the dropdown values as reference (since there is persistence on it) + table_selected_rows = get_selected_rows(selected_genes) + + # deleting stored data for genes not anymore in the selected list + for stored_gene in list( + stored_data.keys() + ): # we need to copy the list of keys before changing the dict + 
if stored_gene not in selected_genes: + del stored_data[stored_gene] + + # storing data for new genes in the selected list + for gene in selected_genes: + if gene not in stored_data: + gene_data = data_manager.get_gene_counts(gene) + stored_data[gene] = { + "counts": gene_data.to_list(), + "samples": gene_data.index.to_list(), + } + + return Serverside(stored_data), selected_genes, table_selected_rows + + @callback( + Output("gene-graph", "figure"), + Output("gene-graph", "style"), + Input("gene-counts", "data"), + Input("gene-graph-jitter", "value"), + Input("gene-graph-pointpos", "value"), + Input("gene-graph-boxmean", "value"), + Input("gene-graph-display-points", "value"), + State("gene-graph", "style"), + # prevent_initial_call=True, + ) + def update_gene_graph( + gene_stored_data: dict, + jitter: float, + pointpos: float, + boxmean: str | bool, + point_display_mode: str, + graph_style: dict, + ): + if not gene_stored_data: + graph_style["display"] = "none" + return {}, graph_style + + graph_style["display"] = "block" + + fig = go.Figure() + + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + for gene, gene_data in reversed(gene_stored_data.items()): + fig.add_trace( + go.Box( + name=gene, + x=gene_data["counts"], + boxmean=boxmean, + jitter=jitter, + pointpos=pointpos, + boxpoints=point_display_mode, + customdata=gene_data["samples"], + hovertemplate="Sample: %{customdata}
Normalised count: %{x}
", + showlegend=False, + ) + ) + + fig.update_layout(xaxis=dict(range=[0, 1]), yaxis=dict(ticklabelstandoff=10)) + + return fig, graph_style diff --git a/modules/local/dash_app/app/src/callbacks/samples.py b/modules/local/dash_app/app/src/callbacks/samples.py new file mode 100644 index 00000000..1ac296c8 --- /dev/null +++ b/modules/local/dash_app/app/src/callbacks/samples.py @@ -0,0 +1,126 @@ +import plotly.graph_objects as go +import numpy as np +from scipy.stats import gaussian_kde +from dash_extensions.enrich import Input, Output, State, callback, Serverside + +from src.utils.data_management import DataManager + +data_manager = DataManager() + + +############################################## +############################################## +# CALLBACKS +############################################## +############################################## + + +def register_callbacks(): + @callback( + Output("sample-counts", "data"), + Input("sample-dropdown", "value"), + State("sample-counts", "data"), + # prevent_initial_call=True, + ) + def update_stored_data( + sample_dropdown_values: list[str], stored_sample_counts: dict + ): + updated_stored_sample_counts = dict(stored_sample_counts) # deep copy + + # deleting stored data for samples not anymore in the selected list + for stored_sample in ( + stored_sample_counts + ): # we need to copy the list of keys before changing the dict + if stored_sample not in sample_dropdown_values: + del updated_stored_sample_counts[stored_sample] + + # storing data for new samples in the selected list + for sample in sample_dropdown_values: + if sample not in updated_stored_sample_counts: + sample_data = data_manager.get_sample_counts(sample) + updated_stored_sample_counts[sample] = { + "counts": sample_data.to_list(), + "genes": sample_data.index.to_list(), + } + + return Serverside(updated_stored_sample_counts) + + @callback( + Output("sample-graph", "figure"), + Output("sample-graph", "style"), + 
Output("sample_stats_display_accordion_control", "disabled"), + Output("sample_points_display_accordion_control", "disabled"), + Output("sample_plot_customisation_accordion_control", "disabled"), + Input("sample-counts", "data"), + Input("curve-type", "value"), + Input("sample-graph-jitter", "value"), + Input("sample-graph-pointpos", "value"), + Input("sample-graph-boxmean", "value"), + Input("sample-graph-display-points", "value"), + State("sample-graph", "style"), + # prevent_initial_call=True, + ) + def update_sample_histogram( + sample_counts: dict, + curve_type: str, + jitter: float, + pointpos: float, + boxmean: str | bool, + point_display_mode: str, + graph_style: dict, + ): + if not sample_counts: + graph_style["display"] = "none" + return {}, graph_style, True, True, True + + graph_style["display"] = "block" + + fig = go.Figure() + + sample_stats_display_ac_disabled = True + sample_points_display_ac_disabled = True + sample_plot_customisation_ac_disabled = True + + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + for sample, sample_data in reversed(sample_counts.items()): + counts = sample_data["counts"] + + if curve_type == "histogram": + fig.add_trace(go.Histogram(name=sample, x=counts)) + + elif curve_type == "kde": + kde_function = gaussian_kde(counts) + xvals = np.linspace(min(counts), max(counts), 1000) + yvals = kde_function(xvals) + fig.add_trace(go.Scatter(name=sample, x=xvals, y=yvals)) + + else: # boxplot + # we need to use the reversed order, otherwise the last traced added is at the top of the graph + fig.add_trace( + go.Box( + name=sample, + x=counts, + jitter=jitter, + pointpos=pointpos, + boxpoints=point_display_mode, + boxmean=boxmean, + customdata=sample_data["genes"], + hovertemplate="Gene: %{customdata}
Count: %{x}
", + ) + ) + # update the layout to remove y-axis labels + fig.update_layout(yaxis=dict(showticklabels=False)) + + sample_stats_display_ac_disabled = False + sample_points_display_ac_disabled = False + sample_plot_customisation_ac_disabled = False + + fig.update_layout(xaxis=dict(range=[0, 1]), yaxis=dict(ticklabelstandoff=10)) + + return ( + fig, + graph_style, + sample_stats_display_ac_disabled, + sample_points_display_ac_disabled, + sample_plot_customisation_ac_disabled, + ) diff --git a/modules/local/dash_app/app/src/components/graphs.py b/modules/local/dash_app/app/src/components/graphs.py new file mode 100644 index 00000000..94c7f66c --- /dev/null +++ b/modules/local/dash_app/app/src/components/graphs.py @@ -0,0 +1,11 @@ +from dash_extensions.enrich import dcc +from src.utils import style + + +def get_graph(graph_id: str): + return dcc.Graph(id=graph_id, figure={}, style=style.GRAPH) + + +gene_graph = get_graph("gene-graph") + +sample_graph = get_graph("sample-graph") diff --git a/modules/local/dash_app/app/src/components/icons.py b/modules/local/dash_app/app/src/components/icons.py new file mode 100755 index 00000000..6801d88b --- /dev/null +++ b/modules/local/dash_app/app/src/components/icons.py @@ -0,0 +1,7 @@ +from dash_iconify import DashIconify + +# all dash-iconify icons can be found at +# https://icon-sets.iconify.design/ +# --------------- SIDEBAR --------------------- +magnifying_glass_icon = DashIconify(icon="radix-icons:magnifying-glass") +data_loaded_icon = DashIconify(icon="akar-icons:circle-check", color="white", width=30) diff --git a/modules/local/dash_app/app/src/components/right_sidebar.py b/modules/local/dash_app/app/src/components/right_sidebar.py new file mode 100644 index 00000000..98aaf072 --- /dev/null +++ b/modules/local/dash_app/app/src/components/right_sidebar.py @@ -0,0 +1,19 @@ +import dash_mantine_components as dmc +from src.components.settings import genes, samples +from src.utils import style + +drawer = dmc.Drawer( + 
children=[ + genes.sidebar_stack, + samples.sidebar_stack, + ], + id="drawer", + opened=True, + position="right", + withCloseButton=True, + closeOnEscape=True, + overlayProps=dict(backgroundOpacity=0), + trapFocus=False, + zIndex=10000, + style=style.SIDEBAR, +) diff --git a/modules/local/dash_app/app/src/components/settings/genes.py b/modules/local/dash_app/app/src/components/settings/genes.py new file mode 100644 index 00000000..764b41c9 --- /dev/null +++ b/modules/local/dash_app/app/src/components/settings/genes.py @@ -0,0 +1,140 @@ +import dash_mantine_components as dmc +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +sorted_genes = data_manager.get_sorted_genes() + +nb_sections = data_manager.get_nb_sections() + +gene_selection_stack = dmc.Stack( + [ + dmc.MultiSelect( + id="gene-dropdown", + label=dmc.Text("Genes to display", fw=600, style={"paddingBottom": "5px"}), + placeholder="Select genes of interest", + nothingFoundMessage="No gene found", + data=sorted_genes, + value=sorted_genes[:nb_sections], + w=400, + clearable=True, + searchable=True, + limit=100, + maxValues=20, + size="sm", + checkIconPosition="right", + hidePickedOptions=True, + disabled=False, + persistence=True, + persisted_props=["value"], + persistence_type="session", + # style=style.DROPDOWN, + comboboxProps={ + "shadow": "md", + "transitionProps": {"transition": "pop", "duration": 200}, + }, + ) + ], + align="stretch", + gap="xl", +) + +gene_graph_stats_display_stack = dmc.Stack( + [ + dmc.Text( + "Display mean / standard deviation", style=style.STACK_SUBSECTION_TITLE + ), + dmc.SegmentedControl( + id="gene-graph-boxmean", + value="sd", + color="teal", + data=[ + {"value": False, "label": "None"}, + {"value": True, "label": "Mean only"}, + {"value": "sd", "label": "Mean + Std"}, + ], + mb=10, + ), + ], + align="center", + gap="xl", +) + +gene_graph_points_display_stack = dmc.Stack( + [ + dmc.Text("Display points", 
style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="gene-graph-display-points", + value="outliers", + color="teal", + data=[ + {"value": "outliers", "label": "Outliers"}, + {"value": "suspectedoutliers", "label": "Suspected Outliers"}, + {"value": "all", "label": "All points"}, + ], + mb=10, + ), + dmc.Text( + "Position of points relatively to boxes", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="gene-graph-pointpos", + value=-1.8, + color="teal", + min=-2, + max=2, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + dmc.Text( + "Spreading of displayed points (jitter)", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="gene-graph-jitter", + value=0.3, + color="teal", + min=0, + max=1, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + ], + align="center", + gap="xl", +) + +sidebar_stack = dmc.Accordion( + value="gene_selection", + children=[ + dmc.AccordionItem( + [ + dmc.AccordionControl("Gene selection"), + dmc.AccordionPanel(gene_selection_stack), + ], + value="gene_selection", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl("Statistics display"), + dmc.AccordionPanel(gene_graph_stats_display_stack), + ], + value="gene_stats_display", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl("Points display"), + dmc.AccordionPanel(gene_graph_points_display_stack), + ], + value="gene_points_display", + ), + ], + id="sidebar-genes-items", + style={"marginTop": "20px", "display": "none"}, +) diff --git a/modules/local/dash_app/app/src/components/settings/samples.py b/modules/local/dash_app/app/src/components/settings/samples.py new file mode 100644 index 00000000..04126f1c --- /dev/null +++ b/modules/local/dash_app/app/src/components/settings/samples.py @@ -0,0 +1,176 @@ +import dash_mantine_components as dmc +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = 
DataManager() + +sorted_samples = data_manager.get_sorted_samples() + +NB_SAMPLES_DEFAULT = 10 + +sample_selection_stack = dmc.Stack( + [ + dmc.MultiSelect( + id="sample-dropdown", + label="Select list of samples to visualise", + placeholder="Select samples", + nothingFoundMessage="No samples found", + data=sorted_samples, + value=sorted_samples[:NB_SAMPLES_DEFAULT], + w=400, + clearable=True, + searchable=True, + limit=100, + maxValues=20, + size="sm", + checkIconPosition="right", + hidePickedOptions=True, + disabled=False, + persistence=True, + persisted_props=["value"], + persistence_type="session", + # style=style.DROPDOWN, + comboboxProps={ + "shadow": "md", + "transitionProps": {"transition": "pop", "duration": 200}, + }, + ) + ], + align="left", + gap="xl", +) + + +sample_graph_plot_type_stack = dmc.Stack( + [ + dmc.Text("Type of plot", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="curve-type", + value="ng", + color="teal", + data=[ + {"value": "histogram", "label": "Histogram"}, + {"value": "kde", "label": "Kde"}, + {"value": "boxplot", "label": "Box-plot"}, + ], + mb=10, + ), + ], + align="left", + gap="xl", +) + + +sample_graph_stats_display_stack = dmc.Stack( + [ + dmc.Text( + "Display mean / standard deviation", style=style.STACK_SUBSECTION_TITLE + ), + dmc.SegmentedControl( + id="sample-graph-boxmean", + value="sd", + color="teal", + data=[ + {"value": False, "label": "None"}, + {"value": True, "label": "Mean only"}, + {"value": "sd", "label": "Mean + Std"}, + ], + mb=10, + ), + ], + align="left", + gap="xl", +) + +sample_graph_points_display_stack = dmc.Stack( + [ + dmc.Text("Display points", style=style.STACK_SUBSECTION_TITLE), + dmc.SegmentedControl( + id="sample-graph-display-points", + value="outliers", + color="teal", + data=[ + {"value": "outliers", "label": "Outliers"}, + {"value": "suspectedoutliers", "label": "Suspected Outliers"}, + {"value": "all", "label": "All points"}, + ], + mb=10, + ), + dmc.Text( + "Position of 
points relatively to boxes", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="sample-graph-pointpos", + value=-1.8, + color="teal", + min=-2, + max=2, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + dmc.Text( + "Spreading of displayed points (jitter)", style=style.STACK_SUBSECTION_TITLE + ), + dmc.Slider( + id="sample-graph-jitter", + value=0.3, + color="teal", + min=0, + max=1, + step=0.1, + persistence=True, + persisted_props=["value"], + persistence_type="session", + mb=35, + ), + ], + align="left", + gap="xl", +) + + +sidebar_stack = dmc.Accordion( + value="sample_selection", + children=[ + dmc.AccordionItem( + [ + dmc.AccordionControl("Sample selection"), + dmc.AccordionPanel(sample_selection_stack), + ], + value="sample_selection", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Plot customisation", + id="sample_plot_customisation_accordion_control", + ), + dmc.AccordionPanel(sample_graph_plot_type_stack), + ], + value="sample_plot_customisation", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Statistics display", id="sample_stats_display_accordion_control" + ), + dmc.AccordionPanel(sample_graph_stats_display_stack), + ], + value="sample_stats_display", + ), + dmc.AccordionItem( + [ + dmc.AccordionControl( + "Points display", id="sample_points_display_accordion_control" + ), + dmc.AccordionPanel(sample_graph_points_display_stack), + ], + value="sample_points_display", + ), + ], + id="sidebar-samples-items", + style={"marginTop": "20px", "display": "none"}, +) diff --git a/modules/local/dash_app/app/src/components/stores.py b/modules/local/dash_app/app/src/components/stores.py new file mode 100755 index 00000000..ee748532 --- /dev/null +++ b/modules/local/dash_app/app/src/components/stores.py @@ -0,0 +1,9 @@ +from dash_extensions.enrich import dcc + +selected_samples = dcc.Store("selected-sample", storage_type="session") +gene_counts = dcc.Store(id="gene-counts", 
storage_type="session", data={}) +sample_counts = dcc.Store(id="sample-counts", storage_type="session", data={}) +stores_to_load = [ + gene_counts, + sample_counts, +] diff --git a/modules/local/dash_app/app/src/components/tables.py b/modules/local/dash_app/app/src/components/tables.py new file mode 100644 index 00000000..3c832931 --- /dev/null +++ b/modules/local/dash_app/app/src/components/tables.py @@ -0,0 +1,46 @@ +import dash_ag_grid as dag +from src.utils import style +from src.utils.data_management import DataManager + +data_manager = DataManager() + +NB_GENES_SELECTED_DEFAULT = 10 + +row_data = data_manager.get_table_raw_data() +# default_selected_rows = data_manager.all_genes_stat_df.head(NB_GENES_SELECTED_DEFAULT).to_dicts() + +column_defs = [ + {"field": col, "headerName": col.replace("_", " ").capitalize()} + for col in data_manager.all_genes_stat_df.columns +] + + +all_genes_stats_table = dag.AgGrid( + rowData=row_data, + columnDefs=column_defs, + className="ag-theme-alpine", + # columnSizeOptions=dict(skipHeader=False), + # columnSize="autoSizetoFit", + defaultColDef=dict( + filter=True, + resizable=True, + editable=False, + sortable=True, + ), + dashGridOptions=dict( + pagination=True, + paginationAutoPageSize=True, + enableCellTextSelection=True, + ensureDomOrder=True, + animateRows=False, + rowSelection=dict(mode="multiRow"), + headerCheckboxSelection=False, + getRowId="params.data.gene_id", + ), + # selectedRows=default_selected_rows, + style=style.AG_GRID, + # persistence=True, + # persistence_type="session", + # persisted_props=["selectedRows"], + id="gene-stats-table", +) diff --git a/modules/local/dash_app/app/src/components/tooltips.py b/modules/local/dash_app/app/src/components/tooltips.py new file mode 100644 index 00000000..cf0e9421 --- /dev/null +++ b/modules/local/dash_app/app/src/components/tooltips.py @@ -0,0 +1,43 @@ +import dash_mantine_components as dmc + + +def get_tooltip( + classname: str, label: str, position: str = "bottom", 
multiline: bool = True +): + return dmc.Tooltip( + target=f".{classname}", + label=label, + multiline=multiline, + position=position, + color="grey", + withArrow=True, + arrowSize=8, + zIndex=20000, + radius=4, + transitionProps={ + "transition": "fade", + "duration": 200, + "timingFunction": "ease", + }, + ) + + +genes_tabitem_tooltip = get_tooltip( + classname="genes-tabitem", label="Distribution of normalised counts gene per gene" +) + +samples_tabitem_tooltip = get_tooltip( + classname="samples-tabitem", + label="Distribution of normalised counts sample per sample", +) + +settings_button_tooltip = get_tooltip( + classname="settings-button", + label="Open settings to select genes / samples and to customise display", +) + +tooltips_to_load = [ + genes_tabitem_tooltip, + samples_tabitem_tooltip, + settings_button_tooltip, +] diff --git a/modules/local/dash_app/app/src/components/top.py b/modules/local/dash_app/app/src/components/top.py new file mode 100755 index 00000000..2ac66959 --- /dev/null +++ b/modules/local/dash_app/app/src/components/top.py @@ -0,0 +1,91 @@ +import dash_mantine_components as dmc +from dash_iconify import DashIconify +from src.components import graphs, tables +from src.utils import style + +gene_icon = DashIconify(icon="material-symbols:genetics", width=20) + +sample_icon = DashIconify(icon="ic:baseline-dashboard-customize", width=20) + + +tabs = dmc.Tabs( + children=[ + dmc.TabsList( + children=[ + dmc.TabsTab( + dmc.Text("Counts / gene", fw=500), + className="genes-tabitem", + color="teal", + leftSection=gene_icon, + value="genes", + style=style.HEADER_TABLIST_ITEM, + ), + dmc.TabsTab( + dmc.Text("Counts / sample", fw=500), + className="samples-tabitem", + leftSection=sample_icon, + value="samples", + color="red", + style=style.HEADER_TABLIST_ITEM, + ), + dmc.TabsTab( + dmc.Text("Statistics - all genes", fw=500), + leftSection=sample_icon, + value="gene_stats", + color="orange", + style=style.HEADER_TABLIST_ITEM, + ), + ], + 
style=style.HEADER_TABLIST, + ), + dmc.TabsPanel( + children=[ + dmc.Text("dhkhg"), + graphs.gene_graph, + ], + style=style.TABS_PANEL, + value="genes", + ), + dmc.TabsPanel( + children=[ + graphs.sample_graph, + ], + style=style.TABS_PANEL, + value="samples", + ), + dmc.TabsPanel( + children=[tables.all_genes_stats_table], + style=style.TABS_PANEL, + value="gene_stats", + ), + ], + id="tabs", + variant="default", + radius="md", + orientation="horizontal", + placement="right", + value="genes", + persistence=True, + persisted_props=["value"], + persistence_type="session", + style=style.TAB, +) + +settings_button = dmc.Button( + "Select data / options", + id="settings-button", + className="settings-button", + color="teal", + style=style.SETTINGS_BUTTON, +) + +header = dmc.Grid( + children=[ + dmc.GridCol(tabs, span=10), + dmc.GridCol( + settings_button, span=2, style={"textAlign": "right", "marginTop": "20px"} + ), + ], + style={"marginRight": "20px"}, + # gutter="xl", +) diff --git a/modules/local/dash_app/app/src/utils/config.py b/modules/local/dash_app/app/src/utils/config.py new file mode 100644 index 00000000..070a8b04 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/config.py @@ -0,0 +1,31 @@ +PLOTLY_APP_PORT = 8080 +HOST = "0.0.0.0" + +LOGO_FILENAME = "assets/nf-core-stableexpression_logo_light_small.png" + +LOGGING_FORMAT = "[%(asctime)s] [%(name)s] %(levelname)s - %(message)s" +DATE_FORMAT = "%Y-%m-%d_%H-%M-%S" + +APP_TITLE = "Counts" +UPDATE_TITLE = "Updating ..." 
+ +DATA_FOLDER = "data" + +ALL_COUNT_FILENAME = "all_counts.imputed.parquet" +ALL_GENES_STAT_FILENAME = "all_genes_summary.csv" +ALL_DESIGNS_FILENAME = "whole_design.csv" + +GENE_ID_COLNAME = "gene_id" +STD_COLNAME = "standard_deviation" +STABILITY_SCORE_COLNAME = "stability_score" +RANK_COLNAME = "rank" +SECTION_COLNAME = "section" + +AG_GRID_DEFAULT_COLUMN_DEF = { + "filter": True, + "resizable": True, + "editable": False, + "sortable": True, +} + +AG_GRID_DEFAULT_OPTIONS = {"pagination": True, "paginationAutoPageSize": True} diff --git a/modules/local/dash_app/app/src/utils/data_management.py b/modules/local/dash_app/app/src/utils/data_management.py new file mode 100644 index 00000000..bfa09ba7 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/data_management.py @@ -0,0 +1,93 @@ +from functools import lru_cache + +import pandas as pd +import polars as pl +from src.utils import config + + +@lru_cache(maxsize=None) +class DataManager: + def __init__(self): + self.all_counts_lf: pl.LazyFrame = self.get_all_count_data() + self.all_genes_stat_df: pl.DataFrame = self.get_all_genes_stat_data() + + @staticmethod + def get_all_count_data() -> pl.LazyFrame: + file = f"{config.DATA_FOLDER}/{config.ALL_COUNT_FILENAME}" + return pl.scan_parquet(file) + + def get_sorted_samples(self) -> list[str]: + return sorted( + self.all_counts_lf.select(pl.exclude(config.GENE_ID_COLNAME)) + .collect_schema() + .names() + ) + + def get_all_genes_stat_data(self) -> pl.DataFrame: + file = f"{config.DATA_FOLDER}/{config.ALL_GENES_STAT_FILENAME}" + stat_df = pl.read_csv(file) + cols_to_select = ["rank"] + [ + col for col in stat_df.columns if col not in ["rank", "is_candidate"] + ] + return stat_df.select(cols_to_select) + + """ + def get_samples_grouped_by_dataset(self) -> list[dict]: + + samples_grouped_by_dataset = [] + + design_file = f"{config.DATA_FOLDER}/{config.ALL_DESIGNS_FILENAME}" + design_df = pd.read_csv(design_file) + + for group, samples in design_df.groupby(["batch", 
"condition"])["sample"]: + batch, condition = group # unpacking + batch_condition_samples_dict = { + "group": f"Dataset: {batch} || Condition: {condition}", + "items": [ + {"value": sample, "label": sample} + for sample in samples.to_list() + if sample in samples_in_count_data + ], + } + samples_grouped_by_dataset.append(batch_condition_samples_dict) + + return samples_grouped_by_dataset + """ + + def get_sorted_genes(self) -> list[str]: + return ( + self.all_genes_stat_df.sort( + by=[config.RANK_COLNAME, config.SECTION_COLNAME], + descending=False, + ) + .select(config.GENE_ID_COLNAME) + .to_series() + .to_list() + ) + + def get_gene_counts(self, gene: str) -> pd.Series: + return ( + self.all_counts_lf.filter(pl.col(config.GENE_ID_COLNAME) == gene) + .select(pl.exclude(config.GENE_ID_COLNAME)) + .collect() + .to_pandas() + .iloc[0] + ) + + def get_sample_counts(self, sample: str) -> pd.Series: + return ( + self.all_counts_lf.select(sample) + .drop_nulls() + .collect() + .to_pandas() + .iloc[:, 0] + ) + + def get_nb_sections(self) -> int: + return self.all_genes_stat_df.select(config.SECTION_COLNAME).n_unique() + + def get_table_raw_data(self) -> list[dict]: + return self.all_genes_stat_df.sort( + by=[config.RANK_COLNAME, config.SECTION_COLNAME], + descending=False, + ).to_dicts() diff --git a/modules/local/dash_app/app/src/utils/style.py b/modules/local/dash_app/app/src/utils/style.py new file mode 100644 index 00000000..65814f06 --- /dev/null +++ b/modules/local/dash_app/app/src/utils/style.py @@ -0,0 +1,83 @@ +LAYOUT = { + "left": "0px", + "top": "0px", + "position": "absolute", + "width": "100%", + "height": "100%", +} + +HEADER_HEIGHT = "5em" + +TAB = { + "position": "fixed", + "top": 0, + "left": 10, + "right": 0, + "width": "100%", + "height": "100%", + #'zIndex': '1001', +} + +HEADER_TABLIST = { + "position": "fixed", + "top": 0, + "left": 10, + "right": 0, + "width": "60%", + "height": HEADER_HEIGHT, + #'zIndex': '1001' +} + +HEADER_TABLIST_ITEM = { + 
#'width': '15vh', + # "text-align": "center", + "paddingRight": "20px", + #'paddingTop': '26px', + #'paddingBottom': '26px', + #'width': LEFT_SIDEBAR_WIDTH +} + +TABS_PANEL = { + "margin-top": HEADER_HEIGHT, + "height": f"calc(100% - {HEADER_HEIGHT})", +} + + +SETTINGS_BUTTON = { + "right": "20px", +} + +SIDEBAR_WIDTH = "15em" + +SIDEBAR = { + "position": "fixed", + "top": HEADER_HEIGHT, + "bottom": 0, + "width": SIDEBAR_WIDTH, + "height": "100vh", + "alignItems": "center", +} + + +DROPDOWN = {"marginTop": "10px", "paddingLeft": "4.2em", "paddingRight": "4.5em"} + +STACK_SUBSECTION_TITLE = {"marginBottom": "-20px"} + +AG_GRID = { + "height": "calc(100% - 10px)", + "top": HEADER_HEIGHT, + "paddingTop": "10px", + "marginRight": "15px", + "paddingRight": "25px", + "marginLeft": "5px", +} + +GRAPH = { + #'width': '100vh', + "top": HEADER_HEIGHT, + "marginLeft": "0px", + "marginRight": "3em", + "marginTop": "2px", + "marginBottom": "3px", + "display": "none", +} diff --git a/modules/local/dash_app/main.nf b/modules/local/dash_app/main.nf new file mode 100644 index 00000000..d64416b7 --- /dev/null +++ b/modules/local/dash_app/main.nf @@ -0,0 +1,59 @@ +process DASH_APP { + + label 'process_high' + + conda "${moduleDir}/app/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fc/fc4abd76b9424d5f5397a6c97e8ed8c2e3a5a454773595204ceb55b39057d812/data': + 'community.wave.seqera.io/library/dash-ag-grid_dash-extensions_dash-iconify_dash-mantine-components_pruned:be6021fe1944629c' }" + + errorStrategy { + if (task.exitStatus == 100) { + log.warn("Could not start the Dash application.") + return 'ignore' // only report errors but ignores it + } else { + log.warn("Could not start the Dash application due to unhandled error.") + return 'ignore' // ignore anyway + } + } + + input: + path all_counts + path whole_design + path all_genes_summary + + output: + path("*"), emit: app + path "versions.yml", emit: versions + + script: + """ + # limiting number of threads to polars / python + export POLARS_MAX_THREADS=${task.cpus} + export OMP_NUM_THREADS=${task.cpus} + + mkdir -p data + mv ${all_counts} ${whole_design} ${all_genes_summary} data/ + cp -r ${moduleDir}/app/* . + + # as of Nextflow version 25.04.8, having these versions sent to the versions topic channel + # results in ERROR ~ No such file or directory: /.command.env + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$( python3 --version | sed "s/Python //" ) + dash: \$( python3 -c "import dash; print(dash.__version__)" ) + dash-extensions: \$( python3 -c "import dash_extensions; print(dash_extensions.__version__)" ) + dash-mantine-components: \$( python3 -c "import dash_mantine_components; print(dash_mantine_components.__version__)" ) + dash-ag-grid: \$( python3 -c "import dash_ag_grid; print(dash_ag_grid.__version__)" ) + polars: \$( python3 -c "import polars; print(polars.__version__)" ) + pandas: \$( python3 -c "import pandas; print(pandas.__version__)" ) + pyarrow: \$( python3 -c "import pyarrow; print(pyarrow.__version__)" ) + scipy: \$( python3 -c "import scipy; print(scipy.__version__)" ) + END_VERSIONS + + # trying to launch the app + # if the resulting exit code is not 124 (exit code of timeout) 
then there is an error + timeout 10 python -B app.py || exit_code=\$?; [ "\$exit_code" -eq 124 ] && exit 0 || exit 100 + """ + +} diff --git a/modules/local/detect_rare_genes/environment.yml b/modules/local/detect_rare_genes/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/detect_rare_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/detect_rare_genes/main.nf b/modules/local/detect_rare_genes/main.nf new file mode 100644 index 00000000..1e2af20d --- /dev/null +++ b/modules/local/detect_rare_genes/main.nf @@ -0,0 +1,40 @@ +process DETECT_RARE_GENES { + + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + path(gene_id_mapping_file) + path(gene_id_occurrences_file) + val(nb_datasets) + val(min_occurrence_frequency) + val(min_occurrence_quantile) + + output: + path('valid_gene_ids.txt'), emit: valid_gene_ids + path('total_gene_id_occurrence_quantiles.csv'), topic: total_gene_id_occurrence_quantiles + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + detect_rare_genes.py \\ + --occurrences $gene_id_occurrences_file \\ + --mappings $gene_id_mapping_file \\ + --nb-datasets $nb_datasets \\ + --min-occurrence-frequency $min_occurrence_frequency \\ + --min-occurrence-quantile $min_occurrence_quantile + + """ + + + stub: + """ + touch fake.validated_genes.txt + """ + +} diff --git a/modules/local/download_ensembl_annotation/environment.yml b/modules/local/download_ensembl_annotation/environment.yml new file mode 100644 index 00000000..8d1ff111 --- /dev/null +++ b/modules/local/download_ensembl_annotation/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tqdm==4.67.3 + - conda-forge::bs4==4.14.3 + - conda-forge::tenacity==9.1.4 diff --git a/modules/local/download_ensembl_annotation/main.nf b/modules/local/download_ensembl_annotation/main.nf new file mode 100644 index 00000000..16bb27cb --- /dev/null +++ b/modules/local/download_ensembl_annotation/main.nf @@ -0,0 +1,34 @@ +process 
DOWNLOAD_ENSEMBL_ANNOTATION { + + label 'process_single' + + tag "${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/98/980a21a12b628a41a6c08a91d4f6646d1122f0d0e38387f724d4f4ee020b8b1d/data': + 'community.wave.seqera.io/library/bs4_httpx_pandas_python_pruned:13dbe891a99b6884' }" + + input: + val species + + output: + path "*.gff3.gz", emit: gff3 + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('bs4'), eval('python3 -c "import bs4; print(bs4.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + """ + download_latest_ensembl_annotation.py \\ + --species ${species} + """ + + stub: + """ + touch fake.gff3.gz.txt + """ + +} diff --git a/modules/local/expressionatlas/getaccessions/environment.yml b/modules/local/expressionatlas/getaccessions/environment.yml new file mode 100644 index 00000000..58ac41c2 --- /dev/null +++ b/modules/local/expressionatlas/getaccessions/environment.yml @@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tenacity==9.1.4 + - conda-forge::pyyaml==6.0.3 + - conda-forge::nltk==3.9.2 diff --git a/modules/local/expressionatlas/getaccessions/main.nf 
b/modules/local/expressionatlas/getaccessions/main.nf new file mode 100644 index 00000000..43434bd2 --- /dev/null +++ b/modules/local/expressionatlas/getaccessions/main.nf @@ -0,0 +1,69 @@ +process EXPRESSIONATLAS_GETACCESSIONS { + + label 'process_high' + + tag "${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f9/f943893a85d82f720432e83fd8d4755e5b42a92deca9d49c06930eaa7fc0c968/data': + 'community.wave.seqera.io/library/httpx_nltk_pandas_python_pruned:ab2f10d1d67a7603' }" + + input: + val species + val keywords + val platform + val random_sampling_size + val random_sampling_seed + + output: + path "accessions.txt", optional: true, emit: accessions + env("SAMPLING_QUOTA"), emit: sampling_quota + path "selected_experiments.metadata.tsv", optional: true, topic: eatlas_selected_datasets + path "species_experiments.metadata.tsv", optional: true, topic: eatlas_all_datasets + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + tuple val("${task.process}"), val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions + tuple val("${task.process}"), val('pyyaml'), eval('python3 -c "import yaml; print(yaml.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + + script: + def keywords_string = keywords.split(',').collect { it.trim() }.join(' ') + def args = " --species $species" + if ( keywords_string != "" ) { + args += " --keywords $keywords_string" + } + if ( platform ) { + args += " --platform $platform" + } + if ( random_sampling_size ) { + args += " 
--random-sampling-size $random_sampling_size" + } + if ( random_sampling_seed ) { + args += " --random-sampling-seed $random_sampling_seed" + } + """ + # limiting CPU usage + export OMP_NUM_THREADS=${task.cpus} + + # the folder where nltk will download data needs to be writable (necessary for singularity) + export NLTK_DATA=\${PWD} + + get_eatlas_accessions.py \\ + $args \\ + --cpus ${task.cpus} + + SAMPLING_QUOTA=\$(cat sampling_quota.txt) + """ + + stub: + """ + touch accessions.txt \\ + all_experiments.metadata.tsv \\ + filtered_experiments.metadata.tsv \\ + filtered_experiments.keywords.yaml + + SAMPLING_QUOTA="ok" + """ + +} diff --git a/modules/local/expressionatlas/getdata/environment.yml b/modules/local/expressionatlas/getdata/environment.yml new file mode 100644 index 00000000..cdb6c8ed --- /dev/null +++ b/modules/local/expressionatlas/getdata/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-base==4.4.3 + - conda-forge::r-optparse==1.7.5 + - bioconda::bioconductor-expressionatlas==1.34.0 diff --git a/modules/local/expressionatlas/getdata/main.nf b/modules/local/expressionatlas/getdata/main.nf new file mode 100644 index 00000000..1902185f --- /dev/null +++ b/modules/local/expressionatlas/getdata/main.nf @@ -0,0 +1,37 @@ +process EXPRESSIONATLAS_GETDATA { + + label 'process_single' + label 'can_fail' + + tag "$accession" + + maxForks 8 // limiting to 8 threads at a time to avoid 429 errors with the Expression Atlas API server + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/96/963bb5cfef2f27d3c5b2a428b18319c65e4d6ff428be08cf3e124e4f9a25a234/data': + 'community.wave.seqera.io/library/bioconductor-expressionatlas_r-base_r-optparse:e15047a6b3701e2c' }" + + input: + val accession + + output: + path("*.counts.csv"), optional: true, emit: counts + path("*.design.csv"), optional: true, emit: design + tuple val(accession), path("failure_reason.txt"), optional: true, topic: eatlas_failure_reason + tuple val(accession), path("warning_reason.txt"), optional: true, topic: eatlas_warning_reason + tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions + tuple val("${task.process}"), val('ExpressionAtlas'), eval('Rscript -e "cat(as.character(packageVersion(\'ExpressionAtlas\')))"'), topic: versions + + script: + """ + download_eatlas_data.R --accession $accession + """ + + stub: + """ + touch acc.raw.counts.csv + touch acc.design.csv + """ + +} diff --git a/modules/local/extract_gene_ids/environment.yml b/modules/local/extract_gene_ids/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/extract_gene_ids/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/extract_gene_ids/main.nf b/modules/local/extract_gene_ids/main.nf new file mode 100644 index 00000000..962e36ee --- /dev/null +++ b/modules/local/extract_gene_ids/main.nf @@ -0,0 +1,25 @@ +process EXTRACT_GENE_IDS { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + + output: + path('*.gene_ids.txt'), optional: true, emit: gene_ids + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + extract_gene_ids.py \\ + --count-file "$count_file" + """ +} diff --git a/modules/local/filter_and_rename_genes/environment.yml b/modules/local/filter_and_rename_genes/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/filter_and_rename_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/filter_and_rename_genes/main.nf b/modules/local/filter_and_rename_genes/main.nf new file mode 100644 index 00000000..e63769a7 --- /dev/null +++ b/modules/local/filter_and_rename_genes/main.nf @@ -0,0 +1,46 @@ +process FILTER_AND_RENAME_GENES { + + label 'process_low' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + path gene_id_mapping_file + path valid_gene_ids_file + + output: + tuple val(meta), path('*.renamed.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: renaming_failure_reason + tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: renaming_warning_reason + tuple val(meta.dataset), env("NB_FINAL"), env("NB_MERGED"), env("NB_NOT_VALID"), env("NB_UNMAPPED"), topic: mqc_id_mapping_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def mapping_arg = gene_id_mapping_file ? "--mappings $gene_id_mapping_file" : "" + def valid_ids_arg = valid_gene_ids_file ? 
"--valid-gene-ids $valid_gene_ids_file" : "" + """ + filter_and_rename_genes.py \\ + --count-file "$count_file" \\ + $mapping_arg \\ + $valid_ids_arg + + NB_UNMAPPED=\$(cat unmapped.txt) + NB_MERGED=\$(cat merged.txt) + NB_NOT_VALID=\$(cat not_valid.txt) + NB_FINAL=\$(cat final.txt) + """ + + + stub: + """ + touch fake_renamed.csv + """ + +} diff --git a/modules/local/filter_out_samples/with_too_many_missing_values/environment.yml b/modules/local/filter_out_samples/with_too_many_missing_values/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/filter_out_samples/with_too_many_missing_values/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/filter_out_samples/with_too_many_missing_values/main.nf b/modules/local/filter_out_samples/with_too_many_missing_values/main.nf new file mode 100644 index 00000000..ebbdc119 --- /dev/null +++ b/modules/local/filter_out_samples/with_too_many_missing_values/main.nf @@ -0,0 +1,36 @@ +process FILTER_OUT_SAMPLES_WITH_TOO_MANY_MISSING_VALUES { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + path valid_gene_ids + val max_null_ratio + + output: + tuple val(meta), path("*.nulls_filtered.parquet"), optional: true, emit: counts + path("ratio_null_values_per_sample.csv"), emit: ratio_nulls_per_sample + tuple val(meta.dataset), path("ratio_null_values.csv"), topic: ratio_nulls + tuple val(meta.dataset), env("NB_KEPT_SAMPLES"), env("NB_REJECTED_SAMPLES"), topic: mqc_missing_values_filter_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + filter_out_samples_with_too_many_missing_values.py \\ + --counts $count_file \\ + --valid-gene-ids $valid_gene_ids \\ + --max-null-ratio $max_null_ratio + + NB_REJECTED_SAMPLES=\$(cat nb_rejected_samples.csv) + NB_KEPT_SAMPLES=\$(cat nb_kept_samples.csv) + """ + +} diff --git a/modules/local/filter_out_samples/with_too_many_zeros/environment.yml b/modules/local/filter_out_samples/with_too_many_zeros/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/filter_out_samples/with_too_many_zeros/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/filter_out_samples/with_too_many_zeros/main.nf b/modules/local/filter_out_samples/with_too_many_zeros/main.nf new file mode 100644 index 00000000..c380fb00 --- /dev/null +++ 
b/modules/local/filter_out_samples/with_too_many_zeros/main.nf @@ -0,0 +1,33 @@ +process FILTER_OUT_SAMPLES_WITH_TOO_MANY_ZEROS { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + val(max_zero_ratio) + + output: + tuple val(meta), path("*.zeros_filtered.parquet"), optional: true, emit: counts + tuple val(meta.dataset), path("ratio_zeros.csv"), topic: ratio_zeros + tuple val(meta.dataset), env("NB_KEPT_SAMPLES"), env("NB_REJECTED_SAMPLES"), topic: mqc_zero_values_filter_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + filter_out_samples_with_too_many_zeros.py \\ + --counts $count_file \\ + --max-zero-ratio $max_zero_ratio + + NB_REJECTED_SAMPLES=\$(cat nb_rejected_samples.csv) + NB_KEPT_SAMPLES=\$(cat nb_kept_samples.csv) + """ + +} diff --git a/modules/local/genorm/compute_m_measure/environment.yml b/modules/local/genorm/compute_m_measure/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/compute_m_measure/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/compute_m_measure/main.nf 
b/modules/local/genorm/compute_m_measure/main.nf new file mode 100644 index 00000000..0dadf854 --- /dev/null +++ b/modules/local/genorm/compute_m_measure/main.nf @@ -0,0 +1,28 @@ +process COMPUTE_M_MEASURE { + + tag "${meta.section}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file), path(ratio_files) + + output: + tuple val(meta), path("m_measures.csv"), emit: m_measures + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + def args = "--task-attempts ${task.attempt}" + """ + compute_m_measures.py \\ + --counts $count_file \\ + --std-files "$ratio_files" \\ + $args + """ + +} diff --git a/modules/local/genorm/cross_join/environment.yml b/modules/local/genorm/cross_join/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/cross_join/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/cross_join/main.nf b/modules/local/genorm/cross_join/main.nf new file mode 100644 index 00000000..aad36bfa --- /dev/null +++ b/modules/local/genorm/cross_join/main.nf @@ -0,0 +1,31 @@ +process CROSS_JOIN { + + tag "${meta.section} :: ${meta.index_1} vs ${meta.index_2}" + label 'process_low' + + 
conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path("count_chunk_file_1"), path("count_chunk_file_2") + + output: + tuple val(meta), path('cross_join.*.parquet'), emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + make_cross_join.py \\ + --file1 count_chunk_file_1 \\ + --file2 count_chunk_file_2 \\ + --index1 ${meta.index_1} \\ + --index2 ${meta.index_2} \\ + ${args} + """ + +} diff --git a/modules/local/genorm/expression_ratio/environment.yml b/modules/local/genorm/expression_ratio/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/expression_ratio/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/expression_ratio/main.nf b/modules/local/genorm/expression_ratio/main.nf new file mode 100644 index 00000000..6a3d1dd6 --- /dev/null +++ b/modules/local/genorm/expression_ratio/main.nf @@ -0,0 +1,28 @@ +process EXPRESSION_RATIO { + + tag "${meta.section} :: ${meta.index_1} vs ${meta.index_2}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && 
!task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(file) + + output: + tuple val(meta), path('ratios.*.parquet'), emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + make_pairwise_gene_expression_ratio.py \\ + --file $file \\ + ${args} + """ + +} diff --git a/modules/local/genorm/make_chunks/environment.yml b/modules/local/genorm/make_chunks/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/make_chunks/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/make_chunks/main.nf b/modules/local/genorm/make_chunks/main.nf new file mode 100644 index 00000000..e2802482 --- /dev/null +++ b/modules/local/genorm/make_chunks/main.nf @@ -0,0 +1,28 @@ +process MAKE_CHUNKS { + + tag "${meta.section}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('count_chunk.*.parquet'), emit: chunks + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + make_parquet_chunks.py \\ + --counts $count_file \\ + ${args} + """ + +} diff --git a/modules/local/genorm/ratio_standard_variation/environment.yml b/modules/local/genorm/ratio_standard_variation/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/genorm/ratio_standard_variation/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/genorm/ratio_standard_variation/main.nf b/modules/local/genorm/ratio_standard_variation/main.nf new file mode 100644 index 00000000..f0279938 --- /dev/null +++ b/modules/local/genorm/ratio_standard_variation/main.nf @@ -0,0 +1,28 @@ +process RATIO_STANDARD_VARIATION { + + tag "${meta.section} :: ${meta.index_1} vs ${meta.index_2}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(file) + + output: + tuple val(meta), path('std.*.parquet'), emit: data + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + + script: + def args = "--task-attempts ${task.attempt}" + """ + get_ratio_standard_variation.py \\ + --file $file \\ + ${args} + """ + +} diff --git a/modules/local/geo/getaccessions/environment.yml b/modules/local/geo/getaccessions/environment.yml new file mode 100644 index 00000000..1071fce5 --- /dev/null +++ b/modules/local/geo/getaccessions/environment.yml @@ -0,0 +1,14 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tenacity==9.1.4 + - conda-forge::nltk==3.9.2 + - conda-forge::tqdm==4.67.3 + - conda-forge::xmltodict==1.0.3 + - conda-forge::biopython==1.86 diff --git a/modules/local/geo/getaccessions/main.nf b/modules/local/geo/getaccessions/main.nf new file mode 100644 index 00000000..aec9c97d --- /dev/null +++ b/modules/local/geo/getaccessions/main.nf @@ -0,0 +1,75 @@ +process GEO_GETACCESSIONS { + + label 'process_high' + + tag "${species}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/e8/e8be45bdbe57d56f7d452513c4799a878fdfeb2f8ff8351f1c02ee99627dc50e/data': + 'community.wave.seqera.io/library/biopython_httpx_nltk_pandas_pruned:f692df8e1f55b14b' }" + + input: + val species + val keywords + val platform + path excluded_accessions_file + val random_sampling_size + val random_sampling_seed + + output: + path "accessions.txt", optional: true, emit: accessions + path "geo_selected_datasets.metadata.tsv", optional: true, topic: geo_selected_datasets + path "geo_all_datasets.metadata.tsv", optional: true, topic: geo_all_datasets + path "geo_rejected_datasets.metadata.tsv", optional: true, topic: geo_rejected_datasets + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + tuple val("${task.process}"), val('nltk'), eval('python3 -c "import nltk; print(nltk.__version__)"'), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('biopython'), eval('python3 -c "import Bio; print(Bio.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + + script: + def keywords_string = keywords.split(',').collect { it.trim() }.join(' ') + def args = " --species $species" + if ( keywords_string != "" ) { + args += " --keywords $keywords_string" + } + if ( platform ) { + args += " --platform $platform" + } + if ( excluded_accessions_file ) { + args += " --exclude-accessions-in $excluded_accessions_file" + } + if ( random_sampling_size ) { + args += " --random-sampling-size $random_sampling_size" + } + if ( random_sampling_seed ) { + args += " --random-sampling-seed $random_sampling_seed" + } + // the folder 
where nltk will download data needs to be writable (necessary for singularity) + """ + # limiting CPU usage + export OMP_NUM_THREADS=${task.cpus} + + # the Entrez module from biopython automatically stores temp results in /.config + # if this directory is not writable, the script fails + export HOME=/tmp/biopython + mkdir -p /tmp/biopython + + export NLTK_DATA=\${PWD} + + get_geo_dataset_accessions.py \\ + $args \\ + --cpus ${task.cpus} + """ + + stub: + """ + touch accessions.txt \\ + all_experiments.metadata.tsv \\ + filtered_experiments.metadata.tsv \\ + filtered_experiments.keywords.yaml + """ + +} diff --git a/modules/local/geo/getdata/environment.yml b/modules/local/geo/getdata/environment.yml new file mode 100644 index 00000000..a9e4fb27 --- /dev/null +++ b/modules/local/geo/getdata/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::r-base==4.5.3 + - conda-forge::r-optparse==1.7.5 + - conda-forge::r-dplyr==1.2.0 + - bioconda::bioconductor-geoquery==2.78.0 + - conda-forge::wget==1.25.0 diff --git a/modules/local/geo/getdata/main.nf b/modules/local/geo/getdata/main.nf new file mode 100644 index 00000000..5996862c --- /dev/null +++ b/modules/local/geo/getdata/main.nf @@ -0,0 +1,42 @@ +process GEO_GETDATA { + + label 'process_single' + label 'can_fail' + + tag "$accession" + + maxForks 8 // limiting to 8 threads at a time to avoid 429 errors with the NCBI server + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/2d/2dd2efcca10168936aabe4209344f952df791be9a7530ddbd9e89cdbfc426a7c/data': + 'community.wave.seqera.io/library/bioconductor-geoquery_r-base_r-dplyr_r-optparse_wget:f425756c75602053' }" + + input: + val accession + val species + + output: + path("*.counts.csv"), optional: true, emit: counts + path("*.design.csv"), optional: true, emit: design + path("rejected/**"), optional: true, emit: rejected + tuple val(accession), path("failure_reason.txt"), optional: true, topic: geo_failure_reason + tuple val(accession), path("warning_reason.txt"), optional: true, topic: geo_warning_reason + tuple val("${task.process}"), val('R'), eval('Rscript -e "cat(R.version.string)" | sed "s/R version //"'), topic: versions + tuple val("${task.process}"), val('GEOquery'), eval('Rscript -e "cat(as.character(packageVersion(\'GEOquery\')))"'), topic: versions + tuple val("${task.process}"), val('dplyr'), eval('Rscript -e "cat(as.character(packageVersion(\'dplyr\')))"'), topic: versions + + script: + """ + download_geo_data.R \\ + --accession $accession \\ + --species $species + """ + + stub: + """ + touch acc.microarray.normalised.counts.csv + touch acc.design.csv + """ + +} diff --git a/modules/local/get_candidate_genes/environment.yml b/modules/local/get_candidate_genes/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/get_candidate_genes/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/get_candidate_genes/main.nf b/modules/local/get_candidate_genes/main.nf new file mode 100644 index 00000000..f0d97e92 --- /dev/null +++ b/modules/local/get_candidate_genes/main.nf @@ -0,0 +1,31 @@ +process GET_CANDIDATE_GENES { + + label 
'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + path count_file + path stat_file + val nb_candidates_per_section + val nb_sections + + output: + path 'section_*.candidate_counts.parquet', emit: counts + path 'section_*.stats.parquet', emit: section_stats + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + get_candidate_genes.py \\ + --counts $count_file \\ + --stats $stat_file \\ + --nb-candidates-per-section $nb_candidates_per_section \\ + --nb-sections $nb_sections + """ + +} diff --git a/modules/local/gprofiler/idmapping/environment.yml b/modules/local/gprofiler/idmapping/environment.yml new file mode 100644 index 00000000..317e648d --- /dev/null +++ b/modules/local/gprofiler/idmapping/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::pandas==3.0.1 + - conda-forge::httpx==0.28.1 + - conda-forge::tenacity==9.1.4 diff --git a/modules/local/gprofiler/idmapping/main.nf b/modules/local/gprofiler/idmapping/main.nf new file mode 100644 index 00000000..a08392da --- /dev/null +++ b/modules/local/gprofiler/idmapping/main.nf @@ -0,0 +1,53 @@ +process GPROFILER_IDMAPPING { + label 'process_medium' + + tag "${species} IDs to ${gprofiler_target_db}" + + errorStrategy { + if (task.exitStatus == 100 ) { + 
log.error("Could not map gene IDs to ${gprofiler_target_db} database.") + 'terminate' + } else if (task.exitStatus in ((130..145) + 104 + 175) && task.attempt <= 10) { // OOM & related errors; should be retried as long as memory does not fit + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else if (task.attempt <= 3) { // all other errors should be retried with exponential backoff with max retry = 3 + sleep(Math.pow(2, task.attempt) * 200 as long) + 'retry' + } else { // after 3 retries, ignore the error + 'finish' + } + } + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/76/767aed0eb8001eaede58f71b9ca72a658c9ca1929b129ed9cf209a8510541c39/data': + 'community.wave.seqera.io/library/httpx_pandas_python_tenacity:233acc91f7920d99' }" + + input: + path gene_id_file + val species + val gprofiler_target_db + + output: + path('mapped_gene_ids.csv'), emit: mapping + path('gene_metadata.csv'), emit: metadata + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('pandas'), eval('python3 -c "import pandas; print(pandas.__version__)"'), topic: versions + tuple val("${task.process}"), val('httpx'), eval('python3 -c "import httpx; print(httpx.__version__)"'), topic: versions + + script: + """ + gprofiler_map_ids.py \\ + --gene-ids $gene_id_file \\ + --species "$species" \\ + --target-db "$gprofiler_target_db" + """ + + + stub: + """ + touch mapped_gene_ids.csv + touch gene_metadata.csv + """ + +} diff --git a/modules/local/impute_missing_values/environment.yml b/modules/local/impute_missing_values/environment.yml new file mode 100644 index 00000000..7be13b15 --- /dev/null +++ b/modules/local/impute_missing_values/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::scikit-learn==1.8.0 diff --git a/modules/local/impute_missing_values/main.nf b/modules/local/impute_missing_values/main.nf new file mode 100644 index 00000000..d2618d0e --- /dev/null +++ b/modules/local/impute_missing_values/main.nf @@ -0,0 +1,27 @@ +process IMPUTE_MISSING_VALUES { + + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/57/5751f4c7c1eb17d92c2863dec2b7505295e56eafb65ea5a9df66876fbffd24e3/data': + 'community.wave.seqera.io/library/polars_python_scikit-learn:041254a8f0633213' }" + + input: + tuple val(meta), path(count_file) + val missing_value_imputer + + output: + tuple val(meta), path('*.imputed.parquet'), emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('scikit-learn'), eval('python3 -c "import sklearn; print(sklearn.__version__)"'), topic: versions + + script: + """ + impute_missing_values.py \\ + --counts $count_file \\ + --imputer $missing_value_imputer + """ + +} diff --git a/modules/local/merge_counts/environment.yml b/modules/local/merge_counts/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/merge_counts/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - 
conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/merge_counts/main.nf b/modules/local/merge_counts/main.nf new file mode 100644 index 00000000..d35b428a --- /dev/null +++ b/modules/local/merge_counts/main.nf @@ -0,0 +1,24 @@ +process MERGE_COUNTS { + + label "process_high" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_files, stageAs: "?/*") + + output: + tuple val(meta), path('all_counts.parquet'), emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + merge_counts.py \\ + --counts "$count_files" + """ + +} diff --git a/modules/local/normalisation/compute_cpm/environment.yml b/modules/local/normalisation/compute_cpm/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/normalisation/compute_cpm/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/normalisation/compute_cpm/main.nf b/modules/local/normalisation/compute_cpm/main.nf new file mode 100644 index 00000000..005ef619 --- /dev/null +++ b/modules/local/normalisation/compute_cpm/main.nf @@ -0,0 +1,29 @@ +process NORMALISATION_COMPUTE_CPM { + + label 'process_single' + + tag "${meta.dataset}" + + conda 
"${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + + output: + tuple val(meta), path('*.cpm.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason + tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + compute_cpm.py \\ + --counts $count_file + """ + + +} diff --git a/modules/local/normalisation/compute_tpm/environment.yml b/modules/local/normalisation/compute_tpm/environment.yml new file mode 100644 index 00000000..45a97ad0 --- /dev/null +++ b/modules/local/normalisation/compute_tpm/environment.yml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 diff --git a/modules/local/normalisation/compute_tpm/main.nf b/modules/local/normalisation/compute_tpm/main.nf new file mode 100644 index 00000000..d90b35fc --- /dev/null +++ b/modules/local/normalisation/compute_tpm/main.nf @@ -0,0 +1,31 @@ +process NORMALISATION_COMPUTE_TPM { + + label 'process_single' + + tag "${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && 
!task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/00/00f1434368763cebf37466cfaaaf069f971f7eae65b010169975c50d084e5af3/data': + 'community.wave.seqera.io/library/polars_python:1a4a3322c56bfeb9' }" + + input: + tuple val(meta), path(count_file) + path gene_lengths_file + + output: + tuple val(meta), path('*.tpm.parquet'), optional: true, emit: counts + tuple val(meta.dataset), path("failure_reason.txt"), optional: true, topic: normalisation_failure_reason + tuple val(meta.dataset), path("warning_reason.txt"), optional: true, topic: normalisation_warning_reason + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + + script: + """ + compute_tpm.py \\ + --counts $count_file \\ + --gene-lengths $gene_lengths_file + """ + + +} diff --git a/modules/local/normfinder/environment.yml b/modules/local/normfinder/environment.yml new file mode 100644 index 00000000..3d7ed06f --- /dev/null +++ b/modules/local/normfinder/environment.yml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::tqdm==4.67.3 + - conda-forge::numpy==2.4.3 + - conda-forge::numba==0.64.0 diff --git a/modules/local/normfinder/main.nf b/modules/local/normfinder/main.nf new file mode 100644 index 00000000..d3ac7809 --- /dev/null +++ b/modules/local/normfinder/main.nf @@ -0,0 +1,36 @@ +process NORMFINDER { + + tag "${meta.section}" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/05/0526f3dbdd23175430f0af81763c1079b3b1425b2cdb2491ab54bb9c0d93d480/data': + 'community.wave.seqera.io/library/numba_numpy_polars_python_tqdm:f42e9bc9f30a29ff' }" + + input: + tuple val(meta), path(count_file) + path design_file + + output: + tuple val(meta), path('stability_values.normfinder.csv'), emit: stability_values + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('tqdm'), eval('python3 -c "import tqdm; print(tqdm.__version__)"'), topic: versions + tuple val("${task.process}"), val('numpy'), eval('python3 -c "import numpy; print(numpy.__version__)"'), topic: versions + tuple val("${task.process}"), val('numba'), eval('python3 -c "import numba; print(numba.__version__)"'), topic: versions + + script: + """ + normfinder.py \\ + --counts $count_file \\ + --design $design_file + """ + + stub: + + """ + touch stability_values.normfinder.csv + """ + +} diff --git a/modules/local/quantile_normalisation/environment.yml b/modules/local/quantile_normalisation/environment.yml new file mode 100644 index 00000000..7be13b15 --- /dev/null +++ b/modules/local/quantile_normalisation/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python=3.14.3 + - conda-forge::polars==1.39.2 + - conda-forge::scikit-learn==1.8.0 diff --git a/modules/local/quantile_normalisation/main.nf b/modules/local/quantile_normalisation/main.nf new file mode 100644 index 00000000..a9106c2e --- /dev/null +++ b/modules/local/quantile_normalisation/main.nf @@ -0,0 +1,34 @@ +process QUANTILE_NORMALISATION { + + label 'process_low' + + tag 
"${meta.dataset}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/57/5751f4c7c1eb17d92c2863dec2b7505295e56eafb65ea5a9df66876fbffd24e3/data': + 'community.wave.seqera.io/library/polars_python_scikit-learn:041254a8f0633213' }" + + input: + tuple val(meta), path(count_file) + val target_distribution + + output: + tuple val(meta), path('*.quant_norm.parquet'), emit: counts + tuple val("${task.process}"), val('python'), eval("python3 --version | sed 's/Python //'"), topic: versions + tuple val("${task.process}"), val('polars'), eval('python3 -c "import polars; print(polars.__version__)"'), topic: versions + tuple val("${task.process}"), val('scikit-learn'), eval('python3 -c "import sklearn; print(sklearn.__version__)"'), topic: versions + + script: + """ + quantile_normalise.py \\ + --counts $count_file \\ + --target-distrib $target_distribution + """ + + stub: + """ + touch count.cpm.quant_norm.parquet + """ + +} diff --git a/modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt b/modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt new file mode 100644 index 00000000..76190304 --- /dev/null +++ b/modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt @@ -0,0 +1,1552 @@ + +version: 6 +environments: +default: +channels: +- url: https://conda.anaconda.org/conda-forge/ +- url: https://conda.anaconda.org/bioconda/ +- url: https://conda.anaconda.org/bioconda/ +options: +pypi-prerelease-mode: if-necessary-or-explicit +packages: +linux-64: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py314h3de4e8d_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.4-hecca717_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.17.1-h27c8c51_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/kaleido-core-0.2.1-h3644ca4_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.18-h0c24ade_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.3-ha770c72_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.3-h73754d4_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.2-hb03c661_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.55-h421ea60_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.52.0-hf4e2dac_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.1-h9d88235_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.6.0-hd42ef1d_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py314h67df5f8_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/mathjax-2.7.7-ha770c72_3.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +- conda: 
https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.38-h29cc59b_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/nss-3.118-h445c969_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.4-h55fea9a_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-12.1.1-py314h8ec4b1a_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-32-1.39.3-py310hffdcd12_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-compat-1.39.3-py310hbcd5346_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/procps-ng-4.0.6-h18c060e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.41.5-py314h2e6c369_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2026.2.28-py314h5bd0f2a_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.30.0-py314h2e6c369_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.52.0-h04a0ce9_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/tiktoken-0.12.0-py314h67fec18_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb03c661_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb03c661_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.3.3-hceb46e0_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-20_gnu.conda +build_number: 20 +sha256: 1dd3fffd892081df9726d7eb7e0dea6198962ba775bd88842135a4ddb4deb3c9 +md5: a9f577daf3de00bca7c3c76c0ecbd1de +depends: +- __glibc >=2.17,<3.0.a0 +- libgomp >=7.5.0 +constrains: +- openmp_impl <0.0a0 +license: BSD-3-Clause +license_family: BSD +size: 28948 +timestamp: 1770939786096 +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +sha256: a3967b937b9abf0f2a99f3173fa4630293979bd1644709d89580e7c62a544661 +md5: aaa2a381ccc56eac91d63b6c1240312f +depends: +- cpython +- python-gil +license: MIT +license_family: MIT +size: 8191 +timestamp: 1744137672556 +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +sha256: e0ea1ba78fbb64f17062601edda82097fcf815012cf52bb704150a2668110d48 +md5: 
2934f256a8acfe48f6ebb4fce6cde29c +depends: +- python >=3.9 +- typing-extensions >=4.0.0 +license: MIT +license_family: MIT +size: 18074 +timestamp: 1733247158254 +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +sha256: 1b6124230bb4e571b1b9401537ecff575b7b109cc3a21ee019f65e083b8399ab +md5: c6b0543676ecb1fb2d7643941fe375f2 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 64927 +timestamp: 1773935801332 +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +noarch: generic +sha256: c31ab719d256bc6f89926131e88ecd0f0c5d003fe8481852c6424f4ec6c7eb29 +md5: a2ac7763a9ac75055b68f325d3255265 +depends: +- python >=3.14 +license: BSD-3-Clause AND MIT AND EPL-2.0 +size: 7514 +timestamp: 1767044983590 +- conda: https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.2.0-py314h3de4e8d_1.conda +sha256: 3ad3500bff54a781c29f16ce1b288b36606e2189d0b0ef2f67036554f47f12b0 +md5: 8910d2c46f7e7b519129f486e0fe927a +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +constrains: +- libbrotlicommon 1.2.0 hb03c661_1 +license: MIT +license_family: MIT +size: 367376 +timestamp: 1764017265553 +- conda: https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hda65f42_9.conda +sha256: 0b75d45f0bba3e95dc693336fa51f40ea28c980131fec438afb7ce6118ed05f6 +md5: d2ffd7602c02f2b316fd921d39876885 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: bzip2-1.0.6 +license_family: BSD +size: 260182 +timestamp: 1771350215188 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc +md5: 4492fd26db29495f0ba23f146cd5638d +depends: +- __unix +license: ISC +size: 147413 +timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +sha256: 
a6b118fd1ed6099dc4fc03f9c492b88882a780fadaef4ed4f93dc70757713656 +md5: 765c4d97e877cdbbb88ff33152b86125 +depends: +- python >=3.10 +license: ISC +size: 151445 +timestamp: 1772001170301 +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +sha256: d86dfd428b2e3c364fa90e07437c8405d635aa4ef54b25ab51d9c712be4112a5 +md5: 49ee13eb9b8f44d63879c69b8a40a74b +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 58510 +timestamp: 1773660086450 +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715 +md5: ea8a6c3256897cc31263de9f455e25d9 +depends: +- python >=3.10 +- __unix +- python +license: BSD-3-Clause +license_family: BSD +size: 97676 +timestamp: 1764518652276 +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +sha256: 8021c76eeadbdd5784b881b165242db9449783e12ce26d6234060026fd6a8680 +md5: b866ff7007b934d564961066c8195983 +depends: +- humanfriendly >=9.1 +- python >=3.9 +license: MIT +license_family: MIT +size: 43758 +timestamp: 1733928076798 +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +sha256: 59c9e29800b483b390467f90e82b0da3a4fbf0612efe1c90813fca232780e160 +md5: 071cf7b0ce333c81718b054066c15102 +depends: +- networkx >=2.0 +- numpy +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 39326 +timestamp: 1735759976140 +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +noarch: generic +sha256: 91b06300879df746214f7363d6c27c2489c80732e46a369eb2afc234bcafb44c +md5: 3bb89e4f795e5414addaa531d6b1500a +depends: +- python >=3.14,<3.15.0a0 +- python_abi * *_cp314 +license: Python-2.0 +size: 50078 +timestamp: 1770674447292 +- conda: https://conda.anaconda.org/conda-forge/linux-64/expat-2.7.4-hecca717_0.conda +sha256: 
0cc345e4dead417996ce9a1f088b28d858f03d113d43c1963d29194366dcce27 +md5: a0535741a4934b3e386051065c58761a +depends: +- __glibc >=2.17,<3.0.a0 +- libexpat 2.7.4 hecca717_0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 145274 +timestamp: 1771259434699 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +sha256: 58d7f40d2940dd0a8aa28651239adbf5613254df0f75789919c4e6762054403b +md5: 0c96522c6bdaed4b1566d11387caaf45 +license: BSD-3-Clause +license_family: BSD +size: 397370 +timestamp: 1566932522327 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +sha256: c52a29fdac682c20d252facc50f01e7c2e7ceac52aa9817aaf0bb83f7559ec5c +md5: 34893075a5c9e55cdafac56607368fc6 +license: OFL-1.1 +license_family: Other +size: 96530 +timestamp: 1620479909603 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +sha256: 00925c8c055a2275614b4d983e1df637245e19058d79fc7dd1a93b8d9fb4b139 +md5: 4d59c254e01d9cde7957100457e2d5fb +license: OFL-1.1 +license_family: Other +size: 700814 +timestamp: 1620479612257 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +sha256: 2821ec1dc454bd8b9a31d0ed22a7ce22422c0aef163c59f49dfdf915d0f0ca14 +md5: 49023d73832ef61042f6a237cb2687e7 +license: LicenseRef-Ubuntu-Font-Licence-Version-1.0 +license_family: Other +size: 1620504 +timestamp: 1727511233259 +- conda: https://conda.anaconda.org/conda-forge/linux-64/fontconfig-2.17.1-h27c8c51_0.conda +sha256: aa4a44dba97151221100a637c7f4bde619567afade9c0265f8e1c8eed8d7bd8c +md5: 867127763fbe935bab59815b6e0b7b5c +depends: +- __glibc >=2.17,<3.0.a0 +- libexpat >=2.7.4,<3.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +- libgcc >=14 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +license: MIT +license_family: MIT +size: 270705 +timestamp: 1771382710863 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +sha256: 54eea8469786bc2291cc40bca5f46438d3e062a399e8f53f013b6a9f50e98333 +md5: a7970cd949a077b7cb9696379d338681 +depends: +- font-ttf-ubuntu +- font-ttf-inconsolata +- font-ttf-dejavu-sans-mono +- font-ttf-source-code-pro +license: BSD-3-Clause +license_family: BSD +size: 4059 +timestamp: 1762351264405 +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +sha256: 84c64443368f84b600bfecc529a1194a3b14c3656ee2e832d15a20e0329b6da3 +md5: 164fc43f0b53b6e3a7bc7dce5e4f1dc9 +depends: +- python >=3.10 +- hyperframe >=6.1,<7 +- hpack >=4.1,<5 +- python +license: MIT +license_family: MIT +size: 95967 +timestamp: 1756364871835 +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba +md5: 0a802cb9888dd14eeefc611f05c40b6e +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 30731 +timestamp: 1737618390337 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +sha256: fa2071da7fab758c669e78227e6094f6b3608228740808a6de5d6bce83d9e52d +md5: 7fe569c10905402ed47024fc481bb371 +depends: +- __unix +- python >=3.9 +license: MIT +license_family: MIT +size: 73563 +timestamp: 1733928021866 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +sha256: 6c4343b376d0b12a4c75ab992640970d36c933cad1fd924f6a1181fa91710e80 +md5: daddf757c3ecd6067b9af1df1f25d89e +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 67994 +timestamp: 1766267728652 +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8 +md5: 8e6923fc12f1fe8f8c4e5c9f343256ac +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 17397 +timestamp: 1737618427549 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/icu-78.3-h33c6efd_0.conda +sha256: fbf86c4a59c2ed05bbffb2ba25c7ed94f6185ec30ecb691615d42342baa1a16a +md5: c80d8a3b84358cb967fa81e7075fbc8a +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: MIT +license_family: MIT +size: 12723451 +timestamp: 1773822285671 +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +sha256: ae89d0299ada2a3162c2614a9d26557a92aa6a77120ce142f8e0109bbf0342b0 +md5: 53abe63df7e10a6ba605dc5f9f961d36 +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 50721 +timestamp: 1760286526795 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 +md5: 080594bf4493e6bae2607e65390c520a +depends: +- python >=3.10 +- zipp >=3.20 +- python +license: Apache-2.0 +license_family: APACHE +size: 34387 +timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +sha256: fc9ca7348a4f25fed2079f2153ecdcf5f9cf2a0bc36c4172420ca09e1849df7b +md5: 04558c96691bed63104678757beb4f8d +depends: +- markupsafe >=2.0 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 120685 +timestamp: 1764517220861 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +sha256: db973a37d75db8e19b5f44bbbdaead0c68dde745407f281e2a7fe4db74ec51d7 +md5: ada41c863af263cc4c5fcbaff7c3e4dc +depends: +- attrs >=22.2.0 +- jsonschema-specifications >=2023.3.6 +- python >=3.10 +- referencing >=0.28.4 +- rpds-py >=0.25.0 +- python +license: MIT +license_family: MIT +size: 82356 +timestamp: 1767839954256 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +sha256: 0a4f3b132f0faca10c89fdf3b60e15abb62ded6fa80aebfc007d05965192aa04 +md5: 439cd0f567d697b20a8f45cb70a1005a +depends: +- python 
>=3.10 +- referencing >=0.31.0 +- python +license: MIT +license_family: MIT +size: 19236 +timestamp: 1757335715225 +- conda: https://conda.anaconda.org/conda-forge/linux-64/kaleido-core-0.2.1-h3644ca4_0.tar.bz2 +sha256: 7f243680ca03eba7457b7a48f93a9440ba8181a8eac20a3eb5ef165ab6c96664 +md5: b3723b235b0758abaae8c82ce4d80146 +depends: +- __glibc >=2.17,<3.0.a0 +- expat >=2.2.10,<3.0.0a0 +- fontconfig +- fonts-conda-forge +- libgcc-ng >=9.3.0 +- mathjax 2.7.* +- nspr >=4.29,<5.0a0 +- nss >=3.62,<4.0a0 +- sqlite >=3.34.0,<4.0a0 +license: MIT +license_family: MIT +size: 62099926 +timestamp: 1615199463039 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lcms2-2.18-h0c24ade_0.conda +sha256: 836ec4b895352110335b9fdcfa83a8dcdbe6c5fb7c06c4929130600caea91c0a +md5: 6f2e2c8f58160147c4d1c6f4c14cbac4 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libtiff >=4.7.1,<4.8.0a0 +license: MIT +license_family: MIT +size: 249959 +timestamp: 1768184673131 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.45.1-default_hbd61a6d_102.conda +sha256: 3d584956604909ff5df353767f3a2a2f60e07d070b328d109f30ac40cd62df6c +md5: 18335a698559cdbcd86150a48bf54ba6 +depends: +- __glibc >=2.17,<3.0.a0 +- zstd >=1.5.7,<1.6.0a0 +constrains: +- binutils_impl_linux-64 2.45.1 +license: GPL-3.0-only +license_family: GPL +size: 728002 +timestamp: 1774197446916 +- conda: https://conda.anaconda.org/conda-forge/linux-64/lerc-4.1.0-hdb68285_0.conda +sha256: f84cb54782f7e9cea95e810ea8fef186e0652d0fa73d3009914fa2c1262594e1 +md5: a752488c68f2e7c456bcbd8f16eec275 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: Apache-2.0 +license_family: Apache +size: 261513 +timestamp: 1773113328888 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libblas-3.11.0-5_h4a7cf45_openblas.conda +build_number: 5 +sha256: 18c72545080b86739352482ba14ba2c4815e19e26a7417ca21a95b76ec8da24c +md5: c160954f7418d7b6e87eaf05a8913fa9 +depends: +- 
libopenblas >=0.3.30,<0.3.31.0a0 +- libopenblas >=0.3.30,<1.0a0 +constrains: +- mkl <2026 +- liblapack 3.11.0 5*_openblas +- libcblas 3.11.0 5*_openblas +- blas 2.305 openblas +- liblapacke 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18213 +timestamp: 1765818813880 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.11.0-5_h0358290_openblas.conda +build_number: 5 +sha256: 0cbdcc67901e02dc17f1d19e1f9170610bd828100dc207de4d5b6b8ad1ae7ad8 +md5: 6636a2b6f1a87572df2970d3ebc87cc0 +depends: +- libblas 3.11.0 5_h4a7cf45_openblas +constrains: +- liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +- liblapack 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18194 +timestamp: 1765818837135 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.25-h17f619e_0.conda +sha256: aa8e8c4be9a2e81610ddf574e05b64ee131fab5e0e3693210c9d6d2fba32c680 +md5: 6c77a605a7a689d17d4819c0f8ac9a00 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 73490 +timestamp: 1761979956660 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.7.4-hecca717_0.conda +sha256: d78f1d3bea8c031d2f032b760f36676d87929b18146351c4464c66b0869df3f5 +md5: e7f7ce06ec24cfcfb9e36d28cf82ba57 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- expat 2.7.4.* +license: MIT +license_family: MIT +size: 76798 +timestamp: 1771259418166 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libffi-3.5.2-h3435931_0.conda +sha256: 31f19b6a88ce40ebc0d5a992c131f57d919f73c0b92cd1617a5bec83f6e961e6 +md5: a360c33a5abe61c07959e449fa1453eb +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 58592 +timestamp: 1769456073053 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype-2.14.3-ha770c72_0.conda +sha256: 38f014a7129e644636e46064ecd6b1945e729c2140e21d75bb476af39e692db2 +md5: e289f3d17880e44b633ba911d57a321b +depends: +- libfreetype6 
>=2.14.3 +license: GPL-2.0-only OR FTL +size: 8049 +timestamp: 1774298163029 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libfreetype6-2.14.3-h73754d4_0.conda +sha256: 16f020f96da79db1863fcdd8f2b8f4f7d52f177dd4c58601e38e9182e91adf1d +md5: fb16b4b69e3f1dcfe79d80db8fd0c55d +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libpng >=1.6.55,<1.7.0a0 +- libzlib >=1.3.2,<2.0a0 +constrains: +- freetype >=2.14.3 +license: GPL-2.0-only OR FTL +size: 384575 +timestamp: 1774298162622 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-15.2.0-he0feb66_18.conda +sha256: faf7d2017b4d718951e3a59d081eb09759152f93038479b768e3d612688f83f5 +md5: 0aa00f03f9e39fb9876085dee11a85d4 +depends: +- __glibc >=2.17,<3.0.a0 +- _openmp_mutex >=4.5 +constrains: +- libgcc-ng ==15.2.0=*_18 +- libgomp 15.2.0 he0feb66_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 1041788 +timestamp: 1771378212382 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-15.2.0-h69a702a_18.conda +sha256: e318a711400f536c81123e753d4c797a821021fb38970cebfb3f454126016893 +md5: d5e96b1ed75ca01906b3d2469b4ce493 +depends: +- libgcc 15.2.0 he0feb66_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27526 +timestamp: 1771378224552 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran-15.2.0-h69a702a_18.conda +sha256: d2c9fad338fd85e4487424865da8e74006ab2e2475bd788f624d7a39b2a72aee +md5: 9063115da5bc35fdc3e1002e69b9ef6e +depends: +- libgfortran5 15.2.0 h68bc16d_18 +constrains: +- libgfortran-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27523 +timestamp: 1771378269450 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-15.2.0-h68bc16d_18.conda +sha256: 539b57cf50ec85509a94ba9949b7e30717839e4d694bc94f30d41c9d34de2d12 +md5: 646855f357199a12f02a87382d429b75 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=15.2.0 +constrains: +- libgfortran 15.2.0 +license: 
GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 2482475 +timestamp: 1771378241063 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libgomp-15.2.0-he0feb66_18.conda +sha256: 21337ab58e5e0649d869ab168d4e609b033509de22521de1bfed0c031bfc5110 +md5: 239c5e9546c38a1e884d69effcf4c882 +depends: +- __glibc >=2.17,<3.0.a0 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 603262 +timestamp: 1771378117851 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libjpeg-turbo-3.1.2-hb03c661_0.conda +sha256: cc9aba923eea0af8e30e0f94f2ad7156e2984d80d1e8e7fe6be5a1f257f0eb32 +md5: 8397539e3a0bbd1695584fb4f927485a +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- jpeg <0.0.0a +license: IJG AND BSD-3-Clause AND Zlib +size: 633710 +timestamp: 1762094827865 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.11.0-5_h47877c9_openblas.conda +build_number: 5 +sha256: c723b6599fcd4c6c75dee728359ef418307280fa3e2ee376e14e85e5bbdda053 +md5: b38076eb5c8e40d0106beda6f95d7609 +depends: +- libblas 3.11.0 5_h4a7cf45_openblas +constrains: +- blas 2.305 openblas +- liblapacke 3.11.0 5*_openblas +- libcblas 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18200 +timestamp: 1765818857876 +- conda: https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.8.2-hb03c661_0.conda +sha256: 755c55ebab181d678c12e49cced893598f2bab22d582fbbf4d8b83c18be207eb +md5: c7c83eecbb72d88b940c249af56c8b17 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- xz 5.8.2.* +license: 0BSD +size: 113207 +timestamp: 1768752626120 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-hb03c661_1.conda +sha256: fe171ed5cf5959993d43ff72de7596e8ac2853e9021dec0344e583734f1e0843 +md5: 2c21e66f50753a083cbe6b80f38268fa +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: BSD-2-Clause +license_family: BSD +size: 92400 +timestamp: 1769482286018 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.30-pthreads_h94d23a6_4.conda +sha256: 199d79c237afb0d4780ccd2fbf829cea80743df60df4705202558675e07dd2c5 +md5: be43915efc66345cccb3c310b6ed0374 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libgfortran +- libgfortran5 >=14.3.0 +constrains: +- openblas >=0.3.30,<0.3.31.0a0 +license: BSD-3-Clause +license_family: BSD +size: 5927939 +timestamp: 1763114673331 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libpng-1.6.55-h421ea60_0.conda +sha256: 36ade759122cdf0f16e2a2562a19746d96cf9c863ffaa812f2f5071ebbe9c03c +md5: 5f13ffc7d30ffec87864e678df9957b4 +depends: +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- libzlib >=1.3.1,<2.0a0 +license: zlib-acknowledgement +size: 317669 +timestamp: 1770691470744 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.52.0-hf4e2dac_0.conda +sha256: d716847b7deca293d2e49ed1c8ab9e4b9e04b9d780aea49a97c26925b28a7993 +md5: fd893f6a3002a635b5e50ceb9dd2c0f4 +depends: +- __glibc >=2.17,<3.0.a0 +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +license: blessing +size: 951405 +timestamp: 1772818874251 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-15.2.0-h934c35e_18.conda +sha256: 78668020064fdaa27e9ab65cd2997e2c837b564ab26ce3bf0e58a2ce1a525c6e +md5: 1b08cd684f34175e4514474793d44bcb +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc 15.2.0 he0feb66_18 +constrains: +- libstdcxx-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 5852330 +timestamp: 1771378262446 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libtiff-4.7.1-h9d88235_1.conda +sha256: e5f8c38625aa6d567809733ae04bb71c161a42e44a9fa8227abe61fa5c60ebe0 +md5: cd5a90476766d53e901500df9215e927 +depends: +- __glibc >=2.17,<3.0.a0 +- lerc >=4.0.0,<5.0a0 +- libdeflate >=1.25,<1.26.0a0 +- libgcc >=14 +- libjpeg-turbo >=3.1.0,<4.0a0 +- liblzma >=5.8.1,<6.0a0 +- libstdcxx >=14 +- libwebp-base >=1.6.0,<2.0a0 +- libzlib 
>=1.3.1,<2.0a0 +- zstd >=1.5.7,<1.6.0a0 +license: HPND +size: 435273 +timestamp: 1762022005702 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.41.3-h5347b49_0.conda +sha256: 1a7539cfa7df00714e8943e18de0b06cceef6778e420a5ee3a2a145773758aee +md5: db409b7c1720428638e7c0d509d3e1b5 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: BSD-3-Clause +license_family: BSD +size: 40311 +timestamp: 1766271528534 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libwebp-base-1.6.0-hd42ef1d_0.conda +sha256: 3aed21ab28eddffdaf7f804f49be7a7d701e8f0e46c856d801270b470820a37b +md5: aea31d2e5b1091feca96fcfe945c3cf9 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +constrains: +- libwebp 1.6.0 +license: BSD-3-Clause +license_family: BSD +size: 429011 +timestamp: 1752159441324 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libxcb-1.17.0-h8a09558_0.conda +sha256: 666c0c431b23c6cec6e492840b176dde533d48b7e6fb8883f5071223433776aa +md5: 92ed62436b625154323d40d5f2f11dd7 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=13 +- pthread-stubs +- xorg-libxau >=1.0.11,<2.0a0 +- xorg-libxdmcp +license: MIT +license_family: MIT +size: 395888 +timestamp: 1727278577118 +- conda: https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.2-h25fd6f3_2.conda +sha256: 55044c403570f0dc26e6364de4dc5368e5f3fc7ff103e867c487e2b5ab2bcda9 +md5: d87ff7921124eccd67248aa483c23fec +depends: +- __glibc >=2.17,<3.0.a0 +constrains: +- zlib 1.3.2 *_2 +license: Zlib +license_family: Other +size: 63629 +timestamp: 1774072609062 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +sha256: 20e0892592a3e7c683e3d66df704a9425d731486a97c34fc56af4da1106b2b6b +md5: ba0a9221ce1063f31692c07370d062f3 +depends: +- importlib-metadata >=4.4 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 85893 +timestamp: 1770694658918 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +sha256: 
7b1da4b5c40385791dbc3cc85ceea9fad5da680a27d5d3cb8bfaa185e304a89e +md5: 5b5203189eb668f042ac2b0826244964 +depends: +- mdurl >=0.1,<1 +- python >=3.10 +license: MIT +license_family: MIT +size: 64736 +timestamp: 1754951288511 +- conda: https://conda.anaconda.org/conda-forge/linux-64/markupsafe-3.0.3-py314h67df5f8_1.conda +sha256: c279be85b59a62d5c52f5dd9a4cd43ebd08933809a8416c22c3131595607d4cf +md5: 9a17c4307d23318476d7fbf0fedc0cde +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +constrains: +- jinja2 >=3.0.0 +license: BSD-3-Clause +license_family: BSD +size: 27424 +timestamp: 1772445227915 +- conda: https://conda.anaconda.org/conda-forge/linux-64/mathjax-2.7.7-ha770c72_3.tar.bz2 +sha256: 02fef69bde69db264a12f21386612262f545b6e3e68d8f1ccec19f3eaae58edf +md5: 86e69bd82c2a2c6fd29f5ab7e02b3691 +license: Apache-2.0 +license_family: Apache +size: 22281629 +timestamp: 1662784498331 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +sha256: 78c1bbe1723449c52b7a9df1af2ee5f005209f67e40b6e1d3c7619127c43b1c7 +md5: 592132998493b3ff25fd7479396e8351 +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 14465 +timestamp: 1733255681319 +- conda: https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +sha256: f005760b13093362fc9c997d603dd487de32ab2e821a3cbce52a42bcb8136517 +md5: 698a8a27c2b9d8a542c70cb47099a75e +depends: +- click +- coloredlogs +- humanize +- importlib-metadata +- jinja2 >=3.0.0 +- jsonschema +- markdown +- natsort +- numpy +- packaging +- pillow >=10.2.0 +- plotly >=5.18 +- polars-lts-cpu +- pyaml-env +- pydantic >=2.7.1 +- python >=3.8,!=3.14.1 +- python-dotenv +- python-kaleido 0.2.1 +- pyyaml >=4 +- requests +- rich >=10 +- rich-click +- spectra >=0.0.10 +- tiktoken +- tqdm +- typeguard +license: GPL-3.0-or-later +license_family: GPL3 +size: 4198799 +timestamp: 1765300743879 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +sha256: 541fd4390a0687228b8578247f1536a821d9261389a65585af9d1a6f2a14e1e0 +md5: 30bec5e8f4c3969e2b1bd407c5e52afb +depends: +- python >=3.10 +- python +license: MIT +size: 280459 +timestamp: 1774380620329 +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +sha256: aeb1548eb72e4f198e72f19d242fb695b35add2ac7b2c00e0d83687052867680 +md5: e941e85e273121222580723010bd4fa2 +depends: +- python >=3.9 +- python +license: MIT +license_family: MIT +size: 39262 +timestamp: 1770905275632 +- conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h2d0b736_3.conda +sha256: 3fde293232fa3fca98635e1167de6b7c7fda83caf24b9d6c91ec9eefb4f4d586 +md5: 47e340acb35de30501a76c7c799c41d7 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=13 +license: X11 AND BSD-3-Clause +size: 891641 +timestamp: 1738195959188 +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +sha256: f6a82172afc50e54741f6f84527ef10424326611503c64e359e25a19a8e4c1c6 +md5: a2c1eeadae7a309daed9d62c96012a2b +depends: +- python >=3.11 +- python +constrains: +- numpy >=1.25 +- scipy >=1.11.2 +- matplotlib-base >=3.8 +- pandas >=2.0 +license: BSD-3-Clause +license_family: BSD +size: 1587439 +timestamp: 1765215107045 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nspr-4.38-h29cc59b_0.conda +sha256: e3664264bd936c357523b55c71ed5a30263c6ba278d726a75b1eb112e6fb0b64 +md5: e235d5566c9cc8970eb2798dd4ecf62f +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: MPL-2.0 +license_family: MOZILLA +size: 228588 +timestamp: 1762348634537 +- conda: https://conda.anaconda.org/conda-forge/linux-64/nss-3.118-h445c969_0.conda +sha256: 44dd98ffeac859d84a6dcba79a2096193a42fc10b29b28a5115687a680dd6aea +md5: 567fbeed956c200c1db5782a424e58ee +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libsqlite >=3.51.0,<4.0a0 +- libstdcxx >=14 +- libzlib 
>=1.3.1,<2.0a0 +- nspr >=4.38,<5.0a0 +license: MPL-2.0 +license_family: MOZILLA +size: 2057773 +timestamp: 1763485556350 +- conda: https://conda.anaconda.org/conda-forge/linux-64/numpy-2.4.3-py314h2b28147_0.conda +sha256: f2ba8cb0d86a6461a6bcf0d315c80c7076083f72c6733c9290086640723f79ec +md5: 36f5b7eb328bdc204954a2225cf908e2 +depends: +- python +- libstdcxx >=14 +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- python_abi 3.14.* *_cp314 +- libcblas >=3.9.0,<4.0a0 +- liblapack >=3.9.0,<4.0a0 +- libblas >=3.9.0,<4.0a0 +constrains: +- numpy-base <0a0 +license: BSD-3-Clause +license_family: BSD +size: 8927860 +timestamp: 1773839233468 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.4-h55fea9a_0.conda +sha256: 3900f9f2dbbf4129cf3ad6acf4e4b6f7101390b53843591c53b00f034343bc4d +md5: 11b3379b191f63139e29c0d19dee24cd +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libpng >=1.6.50,<1.7.0a0 +- libstdcxx >=14 +- libtiff >=4.7.1,<4.8.0a0 +- libzlib >=1.3.1,<2.0a0 +license: BSD-2-Clause +license_family: BSD +size: 355400 +timestamp: 1758489294972 +- conda: https://conda.anaconda.org/conda-forge/linux-64/openssl-3.6.1-h35e630c_1.conda +sha256: 44c877f8af015332a5d12f5ff0fb20ca32f896526a7d0cdb30c769df1144fb5c +md5: f61eb8cd60ff9057122a3d338b99c00f +depends: +- __glibc >=2.17,<3.0.a0 +- ca-certificates +- libgcc >=14 +license: Apache-2.0 +license_family: Apache +size: 3164551 +timestamp: 1769555830639 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 +md5: b76541e68fea4d511b1ac46a28dcd2c6 +depends: +- python >=3.8 +- python +license: Apache-2.0 +license_family: APACHE +size: 72010 +timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pillow-12.1.1-py314h8ec4b1a_0.conda +sha256: 9e6ec8f3213e8b7d64b0ad45f84c51a2c9eba4398efda31e196c9a56186133ee +md5: 79678378ae235e24b3aa83cee1b38207 +depends: +- python +- libgcc >=14 +- 
__glibc >=2.17,<3.0.a0 +- libwebp-base >=1.6.0,<2.0a0 +- zlib-ng >=2.3.3,<2.4.0a0 +- python_abi 3.14.* *_cp314 +- tk >=8.6.13,<8.7.0a0 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libxcb >=1.17.0,<2.0a0 +- openjpeg >=2.5.4,<3.0a0 +- lcms2 >=2.18,<3.0a0 +- libtiff >=4.7.1,<4.8.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +license: HPND +size: 1073026 +timestamp: 1770794002408 +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +sha256: c418d325359fc7a0074cea7f081ef1bce26e114d2da8a0154c5d27ecc87a08e7 +md5: 3e9427ee186846052e81fadde8ebe96a +depends: +- narwhals >=1.15.1 +- packaging +- python >=3.10 +constrains: +- ipywidgets >=7.6 +license: MIT +license_family: MIT +size: 5251872 +timestamp: 1772628857717 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +sha256: d332c2d5002fc440ae37ed9679ffc21b552f18d20232390005d1dd3bce0888d3 +md5: d5a4e013a30dd8dfde9ab39f45aaf9c1 +depends: +- polars-runtime-32 ==1.39.3 +- python >=3.10 +- python +constrains: +- numpy >=1.16.0 +- pyarrow >=7.0.0 +- fastexcel >=0.9 +- openpyxl >=3.0.0 +- xlsx2csv >=0.8.0 +- connectorx >=0.3.2 +- deltalake >=1.0.0 +- pyiceberg >=0.7.1 +- altair >=5.4.0 +- great_tables >=0.8.0 +- polars-runtime-32 ==1.39.3 +- polars-runtime-64 ==1.39.3 +- polars-runtime-compat ==1.39.3 +license: MIT +license_family: MIT +size: 533495 +timestamp: 1774207987966 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +sha256: e466fb31f67ba9bde18deafeb34263ca5eb25807f39ead0e9d753a8e82c4c4f4 +md5: ef0340e75068ac8ff96462749b5c98e7 +depends: +- polars >=1.34.0 +- polars-runtime-compat >=1.34.0 +license: MIT +license_family: MIT +size: 3902 +timestamp: 1760206808444 +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-32-1.39.3-py310hffdcd12_1.conda +noarch: python +sha256: 9744f8086bb0832998f5b01076f57ddc9efbe460e493b14303c3567dc4f401e7 +md5: f9327f9f2cfc4215f55b613e64afd3ba +depends: 
+- python +- libstdcxx >=14 +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 37570276 +timestamp: 1774207987966 +- conda: https://conda.anaconda.org/conda-forge/linux-64/polars-runtime-compat-1.39.3-py310hbcd5346_1.conda +noarch: python +sha256: bf0b932713f0f27924f42159c98426e0073bb6145ed796eaa4cec79ca05363c7 +md5: 4b9b312453eebd6fbdbbe2a88fa1b5c4 +depends: +- python +- libgcc >=14 +- libstdcxx >=14 +- __glibc >=2.17,<3.0.a0 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 37224264 +timestamp: 1774207985377 +- conda: https://conda.anaconda.org/conda-forge/linux-64/procps-ng-4.0.6-h18c060e_0.conda +sha256: 4ce2e1ee31a6217998f78c31ce7dc0a3e0557d9238b51d49dd20c52d467a126d +md5: f2c23a77b25efcad57d377b34bd84941 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-2.0-or-later AND LGPL-2.0-or-later +license_family: GPL +size: 593603 +timestamp: 1769710381284 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pthread-stubs-0.4-hb9d3cd8_1002.conda +sha256: 9c88f8c64590e9567c6c80823f0328e58d3b1efb0e1c539c0315ceca764e0973 +md5: b3c17d95b5a10c6e64a21fa17573e70e +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=13 +license: MIT +license_family: MIT +size: 8252 +timestamp: 1726802366959 +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +sha256: 58994e0d2ea8584cb399546e6f6896d771995e6121d1a7b6a2c9948388358932 +md5: e17be1016bcc3516827b836cd3e4d9dc +depends: +- python >=3.9 +- pyyaml >=5.0,<=7.0 +license: MIT +license_family: MIT +size: 14645 +timestamp: 1736766960536 +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +sha256: 868569d9505b7fe246c880c11e2c44924d7613a8cdcc1f6ef85d5375e892f13d +md5: c3946ed24acdb28db1b5d63321dbca7d +depends: +- typing-inspection >=0.4.2 +- 
typing_extensions >=4.14.1 +- python >=3.10 +- typing-extensions >=4.6.1 +- annotated-types >=0.6.0 +- pydantic-core ==2.41.5 +- python +license: MIT +license_family: MIT +size: 340482 +timestamp: 1764434463101 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pydantic-core-2.41.5-py314h2e6c369_1.conda +sha256: 7e0ae379796e28a429f8e48f2fe22a0f232979d65ec455e91f8dac689247d39f +md5: 432b0716a1dfac69b86aa38fdd59b7e6 +depends: +- python +- typing-extensions >=4.6.0,!=4.7.0 +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 1943088 +timestamp: 1762988995556 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a +md5: 6b6ece66ebcae2d5f326c77ef2c5a066 +depends: +- python >=3.9 +license: BSD-2-Clause +license_family: BSD +size: 889287 +timestamp: 1750615908735 +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8 +md5: 461219d1a5bd61342293efa2c0c90eac +depends: +- __unix +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 21085 +timestamp: 1733217331982 +- conda: https://conda.anaconda.org/conda-forge/linux-64/python-3.14.3-h32b2ec7_101_cp314.conda +build_number: 101 +sha256: cb0628c5f1732f889f53a877484da98f5a0e0f47326622671396fb4f2b0cd6bd +md5: c014ad06e60441661737121d3eae8a60 +depends: +- __glibc >=2.17,<3.0.a0 +- bzip2 >=1.0.8,<2.0a0 +- ld_impl_linux-64 >=2.36.1 +- libexpat >=2.7.3,<3.0a0 +- libffi >=3.5.2,<3.6.0a0 +- libgcc >=14 +- liblzma >=5.8.2,<6.0a0 +- libmpdec >=4.0.0,<5.0a0 +- libsqlite >=3.51.2,<4.0a0 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- openssl >=3.5.5,<4.0a0 +- python_abi 3.14.* *_cp314 +- readline >=8.3,<9.0a0 +- tk >=8.6.13,<8.7.0a0 +- tzdata +- zstd >=1.5.7,<1.6.0a0 +license: Python-2.0 
+size: 36702440 +timestamp: 1770675584356 +python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +sha256: 74e417a768f59f02a242c25e7db0aa796627b5bc8c818863b57786072aeb85e5 +md5: 130584ad9f3a513cdd71b1fdc1244e9c +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 27848 +timestamp: 1772388605021 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +sha256: 233aebd94c704ac112afefbb29cf4170b7bc606e22958906f2672081bc50638a +md5: 235765e4ea0d0301c75965985163b5a1 +depends: +- cpython 3.14.3.* +- python_abi * *_cp314 +license: Python-2.0 +size: 50062 +timestamp: 1770674497152 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +sha256: e17bf63a30aec33432f1ead86e15e9febde9fc40a7f869c0e766be8d2db44170 +md5: 310259a5b03ff02289d7705f39e2b1d2 +depends: +- kaleido-core 0.2.1.* +- python >=3.5 +license: MIT +license_family: MIT +size: 18320 +timestamp: 1615204747600 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +build_number: 8 +sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 +md5: 0539938c55b6b1a59b560e843ad864a4 +constrains: +- python 3.14.* *_cp314 +license: BSD-3-Clause +license_family: BSD +size: 6989 +timestamp: 1752805904792 +- conda: https://conda.anaconda.org/conda-forge/linux-64/pyyaml-6.0.3-py314h67df5f8_1.conda +sha256: b318fb070c7a1f89980ef124b80a0b5ccf3928143708a85e0053cde0169c699d +md5: 2035f68f96be30dc60a5dfd7452c7941 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +- yaml >=0.2.5,<0.3.0a0 +license: MIT +license_family: MIT +size: 202391 +timestamp: 1770223462836 +- conda: https://conda.anaconda.org/conda-forge/linux-64/readline-8.3-h853b02a_0.conda +sha256: 12ffde5a6f958e285aa22c191ca01bbd3d6e710aa852e00618fa6ddc59149002 +md5: 
d7d95fc8287ea7bf33e0e7116d2b95ec +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-3.0-only +license_family: GPL +size: 345073 +timestamp: 1765813471974 +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +sha256: 0577eedfb347ff94d0f2fa6c052c502989b028216996b45c7f21236f25864414 +md5: 870293df500ca7e18bedefa5838a22ab +depends: +- attrs >=22.2.0 +- python >=3.10 +- rpds-py >=0.7.0 +- typing_extensions >=4.4.0 +- python +license: MIT +license_family: MIT +size: 51788 +timestamp: 1760379115194 +- conda: https://conda.anaconda.org/conda-forge/linux-64/regex-2026.2.28-py314h5bd0f2a_0.conda +sha256: e085e336f1446f5263a3ec9747df8c719b6996753901181add50dc4fdd8bb2e8 +md5: 3c8b6a8c4d0ff5a264e9831eac4941f4 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +license: Apache-2.0 AND CNRI-Python +license_family: PSF +size: 411924 +timestamp: 1772255161535 +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +sha256: 7813c38b79ae549504b2c57b3f33394cea4f2ad083f0994d2045c2e24cb538c5 +md5: c65df89a0b2e321045a9e01d1337b182 +depends: +- python >=3.10 +- certifi >=2017.4.17 +- charset-normalizer >=2,<4 +- idna >=2.5,<4 +- urllib3 >=1.21.1,<3 +- python +constrains: +- chardet >=3.0.2,<6 +license: Apache-2.0 +license_family: APACHE +size: 63602 +timestamp: 1766926974520 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +sha256: b06ce84d6a10c266811a7d3adbfa1c11f13393b91cc6f8a5b468277d90be9590 +md5: 7a6289c50631d620652f5045a63eb573 +depends: +- markdown-it-py >=2.2.0 +- pygments >=2.13.0,<3.0.0 +- python >=3.10 +- typing_extensions >=4.0.0,<5.0.0 +- python +license: MIT +license_family: MIT +size: 208472 +timestamp: 1771572730357 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +sha256: 
aa3fcb167321bae51998de2e94d199109c9024f25a5a063cb1c28d8f1af33436 +md5: 0c20a8ebcddb24a45da89d5e917e6cb9 +depends: +- python >=3.10 +- rich >=12 +- click >=8 +- typing-extensions >=4 +- __unix +- python +license: MIT +license_family: MIT +size: 64356 +timestamp: 1769850479089 +- conda: https://conda.anaconda.org/conda-forge/linux-64/rpds-py-0.30.0-py314h2e6c369_0.conda +sha256: e53b0cbf3b324eaa03ca1fe1a688fdf4ab42cea9c25270b0a7307d8aaaa4f446 +md5: c1c368b5437b0d1a68f372ccf01cb133 +depends: +- python +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 376121 +timestamp: 1764543122774 +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +sha256: 7c65782d2511738e62c70462e89d65da4fa54d5a7e47c46667bcd27a59f81876 +md5: 472239e4eb7b5a84bb96b3ed7e3a596a +depends: +- colormath >=3.0.0 +- python >=3.9 +license: MIT +license_family: MIT +size: 22284 +timestamp: 1735770589188 +- conda: https://conda.anaconda.org/conda-forge/linux-64/sqlite-3.52.0-h04a0ce9_0.conda +sha256: c9af81e7830d9c4b67a7f48e512d060df2676b29cac59e3b31f09dbfcee29c58 +md5: 7d9d7efe9541d4bb71b5934e8ee348ea +depends: +- __glibc >=2.17,<3.0.a0 +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libsqlite 3.52.0 hf4e2dac_0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- readline >=8.3,<9.0a0 +license: blessing +size: 203641 +timestamp: 1772818888368 +- conda: https://conda.anaconda.org/conda-forge/linux-64/tiktoken-0.12.0-py314h67fec18_3.conda +sha256: 7e395d67fd249d901beb1ae269057763c0d8c3ee5f7a348694bdb16d158a37d9 +md5: d705f9d8a1185a2b01cced191177a028 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +- regex >=2022.1.18 +- requests >=2.26.0 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 939648 +timestamp: 1764028306357 +- conda: 
https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h366c992_103.conda +sha256: cafeec44494f842ffeca27e9c8b0c27ed714f93ac77ddadc6aaf726b5554ebac +md5: cffd3bdd58090148f4cfcd831f4b26ab +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +constrains: +- xorg-libx11 >=1.8.12,<2.0a0 +license: TCL +license_family: BSD +size: 3301196 +timestamp: 1769460227866 +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +sha256: 9ef8e47cf00e4d6dcc114eb32a1504cc18206300572ef14d76634ba29dfe1eb6 +md5: e5ce43272193b38c2e9037446c1d9206 +depends: +- python >=3.10 +- __unix +- python +license: MPL-2.0 and MIT +size: 94132 +timestamp: 1770153424136 +- conda: https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +sha256: 39d8ae33c43cdb8f771373e149b0b4fae5a08960ac58dcca95b2f1642bb17448 +md5: 260af1b0a94f719de76b4e14094e9a3b +depends: +- importlib-metadata >=3.6 +- python >=3.10 +- typing-extensions >=4.10.0 +- typing_extensions >=4.14.0 +constrains: +- pytest >=7 +license: MIT +license_family: MIT +size: 36838 +timestamp: 1771532971545 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +sha256: 7c2df5721c742c2a47b2c8f960e718c930031663ac1174da67c1ed5999f7938c +md5: edd329d7d3a4ab45dcf905899a7a6115 +depends: +- typing_extensions ==4.15.0 pyhcf101f3_0 +license: PSF-2.0 +license_family: PSF +size: 91383 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +sha256: 70db27de58a97aeb7ba7448366c9853f91b21137492e0b4430251a1870aa8ff4 +md5: a0a4a3035667fc34f29bfbd5c190baa6 +depends: +- python >=3.10 +- typing_extensions >=4.12.0 +license: MIT +license_family: MIT +size: 18923 +timestamp: 1764158430324 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 +md5: 
0caa1af407ecff61170c9437a808404d +depends: +- python >=3.10 +- python +license: PSF-2.0 +license_family: PSF +size: 51692 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c +md5: ad659d0a2b3e47e38d829aa8cad2d610 +license: LicenseRef-Public-Domain +size: 119135 +timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +sha256: af641ca7ab0c64525a96fd9ad3081b0f5bcf5d1cbb091afb3f6ed5a9eee6111a +md5: 9272daa869e03efe68833e3dc7a02130 +depends: +- backports.zstd >=1.0.0 +- brotli-python >=1.2.0 +- h2 >=4,<5 +- pysocks >=1.5.6,<2.0,!=1.5.7 +- python >=3.10 +license: MIT +license_family: MIT +size: 103172 +timestamp: 1767817860341 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxau-1.0.12-hb03c661_1.conda +sha256: 6bc6ab7a90a5d8ac94c7e300cc10beb0500eeba4b99822768ca2f2ef356f731b +md5: b2895afaf55bf96a8c8282a2e47a5de0 +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 15321 +timestamp: 1762976464266 +- conda: https://conda.anaconda.org/conda-forge/linux-64/xorg-libxdmcp-1.1.5-hb03c661_1.conda +sha256: 25d255fb2eef929d21ff660a0c687d38a6d2ccfbcbf0cc6aa738b12af6e9d142 +md5: 1dafce8548e38671bea82e3f5c6ce22f +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 20591 +timestamp: 1762976546182 +- conda: https://conda.anaconda.org/conda-forge/linux-64/yaml-0.2.5-h280c20c_3.conda +sha256: 6d9ea2f731e284e9316d95fa61869fe7bbba33df7929f82693c121022810f4ad +md5: a77f85f77be52ff59391544bfe73390a +depends: +- libgcc >=14 +- __glibc >=2.17,<3.0.a0 +license: MIT +license_family: MIT +size: 85189 +timestamp: 1753484064210 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +sha256: b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae +md5: 
30cd29cb87d819caead4d55184c1d115 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 24194 +timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zlib-ng-2.3.3-hceb46e0_1.conda +sha256: ea4e50c465d70236408cb0bfe0115609fd14db1adcd8bd30d8918e0291f8a75f +md5: 2aadb0d17215603a82a2a6b0afd9a4cb +depends: +- __glibc >=2.17,<3.0.a0 +- libgcc >=14 +- libstdcxx >=14 +license: Zlib +license_family: Other +size: 122618 +timestamp: 1770167931827 +- conda: https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.7-hb78ec9c_6.conda +sha256: 68f0206ca6e98fea941e5717cec780ed2873ffabc0e1ed34428c061e2c6268c7 +md5: 4a13eeac0b5c8e5b8ab496e6c4ddd829 +depends: +- __glibc >=2.17,<3.0.a0 +- libzlib >=1.3.1,<2.0a0 +license: BSD-3-Clause +license_family: BSD +size: 601375 +timestamp: 1764777111296 diff --git a/modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt b/modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt new file mode 100644 index 00000000..a58231a0 --- /dev/null +++ b/modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt @@ -0,0 +1,1502 @@ + +version: 6 +environments: +default: +channels: +- url: https://conda.anaconda.org/conda-forge/ +- url: https://conda.anaconda.org/bioconda/ +- url: https://conda.anaconda.org/bioconda/ +options: +pypi-prerelease-mode: if-necessary-or-explicit +packages: +linux-aarch64: +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-python-1.2.0-py314h352cb57_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/expat-2.7.4-hfae3067_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.17.1-hba86a56_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/icu-78.3-hcab7f73_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/kaleido-core-0.2.1-he5a581e_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.18-h9d5b58d_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.1.0-h52b7260_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.7.4-hfae3067_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.3-h8af1aa0_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.3-hdae7a39_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.2.0-he9431aa_18.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.2.0-h1b7bec0_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.2-he30d5cf_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-5_h88aeb00_openblas.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.2-he30d5cf_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.30-pthreads_h9d3fd7e_4.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.55-h1abf092_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.52.0-h10b116e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_18.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.1-hdb009f0_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.41.3-h1022ec0_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.6.0-ha2e29f5_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/markupsafe-3.0.3-py314hb76de3f_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mathjax-2.7.7-h8af1aa0_3.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +- conda: 
https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.5-ha32ae93_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nspr-4.38-h3ad9384_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nss-3.118-h544fa81_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.4.3-py314haac167e_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.4-h5da879a_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.1-h546c87b_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-12.1.1-py314hac3e5ec_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-32-1.39.3-py310hff09b76_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-compat-1.39.3-py310hf00a4a2_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/procps-ng-4.0.6-h1779866_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/pydantic-core-2.41.5-py314h451b6cc_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.3-hb06a95a_101_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pyyaml-6.0.3-py314h807365f_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/regex-2026.2.28-py314h51f160d_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rpds-py-0.30.0-py314h02b7a91_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sqlite-3.52.0-hf1c7be2_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tiktoken-0.12.0-py314h6a36e60_3.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +- conda: 
https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-he30d5cf_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-he30d5cf_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/yaml-0.2.5-h80f16a2_3.conda +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-ng-2.3.3-ha7cb516_1.conda +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda +packages: +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/_openmp_mutex-4.5-20_gnu.conda +build_number: 20 +sha256: a2527b1d81792a0ccd2c05850960df119c2b6d8f5fdec97f2db7d25dc23b1068 +md5: 468fd3bb9e1f671d36c2cbc677e56f1d +depends: +- libgomp >=7.5.0 +constrains: +- openmp_impl <0.0a0 +license: BSD-3-Clause +license_family: BSD +size: 28926 +timestamp: 1770939656741 +- conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda +sha256: a3967b937b9abf0f2a99f3173fa4630293979bd1644709d89580e7c62a544661 +md5: aaa2a381ccc56eac91d63b6c1240312f +depends: +- cpython +- python-gil +license: MIT +license_family: MIT +size: 8191 +timestamp: 1744137672556 +- conda: https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.7.0-pyhd8ed1ab_1.conda +sha256: e0ea1ba78fbb64f17062601edda82097fcf815012cf52bb704150a2668110d48 +md5: 
2934f256a8acfe48f6ebb4fce6cde29c +depends: +- python >=3.9 +- typing-extensions >=4.0.0 +license: MIT +license_family: MIT +size: 18074 +timestamp: 1733247158254 +- conda: https://conda.anaconda.org/conda-forge/noarch/attrs-26.1.0-pyhcf101f3_0.conda +sha256: 1b6124230bb4e571b1b9401537ecff575b7b109cc3a21ee019f65e083b8399ab +md5: c6b0543676ecb1fb2d7643941fe375f2 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 64927 +timestamp: 1773935801332 +- conda: https://conda.anaconda.org/conda-forge/noarch/backports.zstd-1.3.0-py314h680f03e_0.conda +noarch: generic +sha256: c31ab719d256bc6f89926131e88ecd0f0c5d003fe8481852c6424f4ec6c7eb29 +md5: a2ac7763a9ac75055b68f325d3255265 +depends: +- python >=3.14 +license: BSD-3-Clause AND MIT AND EPL-2.0 +size: 7514 +timestamp: 1767044983590 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/brotli-python-1.2.0-py314h352cb57_1.conda +sha256: 5a5b0cdcd7ed89c6a8fb830924967f6314a2b71944bc1ebc2c105781ba97aa75 +md5: a1b5c571a0923a205d663d8678df4792 +depends: +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python >=3.14,<3.15.0a0 *_cp314 +- python_abi 3.14.* *_cp314 +constrains: +- libbrotlicommon 1.2.0 he30d5cf_1 +license: MIT +license_family: MIT +size: 373193 +timestamp: 1764017486851 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/bzip2-1.0.8-h4777abc_9.conda +sha256: b3495077889dde6bb370938e7db82be545c73e8589696ad0843a32221520ad4c +md5: 840d8fc0d7b3209be93080bc20e07f2d +depends: +- libgcc >=14 +license: bzip2-1.0.6 +license_family: BSD +size: 192412 +timestamp: 1771350241232 +- conda: https://conda.anaconda.org/conda-forge/noarch/ca-certificates-2026.2.25-hbd8a1cb_0.conda +sha256: 67cc7101b36421c5913a1687ef1b99f85b5d6868da3abbf6ec1a4181e79782fc +md5: 4492fd26db29495f0ba23f146cd5638d +depends: +- __unix +license: ISC +size: 147413 +timestamp: 1772006283803 +- conda: https://conda.anaconda.org/conda-forge/noarch/certifi-2026.2.25-pyhd8ed1ab_0.conda +sha256: 
a6b118fd1ed6099dc4fc03f9c492b88882a780fadaef4ed4f93dc70757713656 +md5: 765c4d97e877cdbbb88ff33152b86125 +depends: +- python >=3.10 +license: ISC +size: 151445 +timestamp: 1772001170301 +- conda: https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.4.6-pyhd8ed1ab_0.conda +sha256: d86dfd428b2e3c364fa90e07437c8405d635aa4ef54b25ab51d9c712be4112a5 +md5: 49ee13eb9b8f44d63879c69b8a40a74b +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 58510 +timestamp: 1773660086450 +- conda: https://conda.anaconda.org/conda-forge/noarch/click-8.3.1-pyh8f84b5b_1.conda +sha256: 38cfe1ee75b21a8361c8824f5544c3866f303af1762693a178266d7f198e8715 +md5: ea8a6c3256897cc31263de9f455e25d9 +depends: +- python >=3.10 +- __unix +- python +license: BSD-3-Clause +license_family: BSD +size: 97676 +timestamp: 1764518652276 +- conda: https://conda.anaconda.org/conda-forge/noarch/coloredlogs-15.0.1-pyhd8ed1ab_4.conda +sha256: 8021c76eeadbdd5784b881b165242db9449783e12ce26d6234060026fd6a8680 +md5: b866ff7007b934d564961066c8195983 +depends: +- humanfriendly >=9.1 +- python >=3.9 +license: MIT +license_family: MIT +size: 43758 +timestamp: 1733928076798 +- conda: https://conda.anaconda.org/conda-forge/noarch/colormath-3.0.0-pyhd8ed1ab_4.conda +sha256: 59c9e29800b483b390467f90e82b0da3a4fbf0612efe1c90813fca232780e160 +md5: 071cf7b0ce333c81718b054066c15102 +depends: +- networkx >=2.0 +- numpy +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 39326 +timestamp: 1735759976140 +- conda: https://conda.anaconda.org/conda-forge/noarch/cpython-3.14.3-py314hd8ed1ab_101.conda +noarch: generic +sha256: 91b06300879df746214f7363d6c27c2489c80732e46a369eb2afc234bcafb44c +md5: 3bb89e4f795e5414addaa531d6b1500a +depends: +- python >=3.14,<3.15.0a0 +- python_abi * *_cp314 +license: Python-2.0 +size: 50078 +timestamp: 1770674447292 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/expat-2.7.4-hfae3067_0.conda +sha256: 
5f087bef054c681edcaae84a8c2230585b938691e371ff92957a30707b7fcdf7 +md5: b304307db639831ad7caabd2eac6fca6 +depends: +- libexpat 2.7.4 hfae3067_0 +- libgcc >=14 +license: MIT +license_family: MIT +size: 137701 +timestamp: 1771259543650 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 +sha256: 58d7f40d2940dd0a8aa28651239adbf5613254df0f75789919c4e6762054403b +md5: 0c96522c6bdaed4b1566d11387caaf45 +license: BSD-3-Clause +license_family: BSD +size: 397370 +timestamp: 1566932522327 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 +sha256: c52a29fdac682c20d252facc50f01e7c2e7ceac52aa9817aaf0bb83f7559ec5c +md5: 34893075a5c9e55cdafac56607368fc6 +license: OFL-1.1 +license_family: Other +size: 96530 +timestamp: 1620479909603 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 +sha256: 00925c8c055a2275614b4d983e1df637245e19058d79fc7dd1a93b8d9fb4b139 +md5: 4d59c254e01d9cde7957100457e2d5fb +license: OFL-1.1 +license_family: Other +size: 700814 +timestamp: 1620479612257 +- conda: https://conda.anaconda.org/conda-forge/noarch/font-ttf-ubuntu-0.83-h77eed37_3.conda +sha256: 2821ec1dc454bd8b9a31d0ed22a7ce22422c0aef163c59f49dfdf915d0f0ca14 +md5: 49023d73832ef61042f6a237cb2687e7 +license: LicenseRef-Ubuntu-Font-Licence-Version-1.0 +license_family: Other +size: 1620504 +timestamp: 1727511233259 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/fontconfig-2.17.1-hba86a56_0.conda +sha256: 835aff8615dd8d8fff377679710ce81b8a2c47b6404e21a92fb349fda193a15c +md5: 0fed1ff55f4938a65907f3ecf62609db +depends: +- libexpat >=2.7.4,<3.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +- libgcc >=14 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +license: MIT +license_family: MIT +size: 279044 +timestamp: 1771382728182 +- conda: https://conda.anaconda.org/conda-forge/noarch/fonts-conda-forge-1-hc364b38_1.conda +sha256: 
54eea8469786bc2291cc40bca5f46438d3e062a399e8f53f013b6a9f50e98333 +md5: a7970cd949a077b7cb9696379d338681 +depends: +- font-ttf-ubuntu +- font-ttf-inconsolata +- font-ttf-dejavu-sans-mono +- font-ttf-source-code-pro +license: BSD-3-Clause +license_family: BSD +size: 4059 +timestamp: 1762351264405 +- conda: https://conda.anaconda.org/conda-forge/noarch/h2-4.3.0-pyhcf101f3_0.conda +sha256: 84c64443368f84b600bfecc529a1194a3b14c3656ee2e832d15a20e0329b6da3 +md5: 164fc43f0b53b6e3a7bc7dce5e4f1dc9 +depends: +- python >=3.10 +- hyperframe >=6.1,<7 +- hpack >=4.1,<5 +- python +license: MIT +license_family: MIT +size: 95967 +timestamp: 1756364871835 +- conda: https://conda.anaconda.org/conda-forge/noarch/hpack-4.1.0-pyhd8ed1ab_0.conda +sha256: 6ad78a180576c706aabeb5b4c8ceb97c0cb25f1e112d76495bff23e3779948ba +md5: 0a802cb9888dd14eeefc611f05c40b6e +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 30731 +timestamp: 1737618390337 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanfriendly-10.0-pyh707e725_8.conda +sha256: fa2071da7fab758c669e78227e6094f6b3608228740808a6de5d6bce83d9e52d +md5: 7fe569c10905402ed47024fc481bb371 +depends: +- __unix +- python >=3.9 +license: MIT +license_family: MIT +size: 73563 +timestamp: 1733928021866 +- conda: https://conda.anaconda.org/conda-forge/noarch/humanize-4.15.0-pyhd8ed1ab_0.conda +sha256: 6c4343b376d0b12a4c75ab992640970d36c933cad1fd924f6a1181fa91710e80 +md5: daddf757c3ecd6067b9af1df1f25d89e +depends: +- python >=3.10 +license: MIT +license_family: MIT +size: 67994 +timestamp: 1766267728652 +- conda: https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.conda +sha256: 77af6f5fe8b62ca07d09ac60127a30d9069fdc3c68d6b256754d0ffb1f7779f8 +md5: 8e6923fc12f1fe8f8c4e5c9f343256ac +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 17397 +timestamp: 1737618427549 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/icu-78.3-hcab7f73_0.conda +sha256: 
49ba6aed2c6b482bb0ba41078057555d29764299bc947b990708617712ef6406 +md5: 546da38c2fa9efacf203e2ad3f987c59 +depends: +- libgcc >=14 +- libstdcxx >=14 +license: MIT +license_family: MIT +size: 12837286 +timestamp: 1773822650615 +- conda: https://conda.anaconda.org/conda-forge/noarch/idna-3.11-pyhd8ed1ab_0.conda +sha256: ae89d0299ada2a3162c2614a9d26557a92aa6a77120ce142f8e0109bbf0342b0 +md5: 53abe63df7e10a6ba605dc5f9f961d36 +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 50721 +timestamp: 1760286526795 +- conda: https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-8.8.0-pyhcf101f3_0.conda +sha256: 82ab2a0d91ca1e7e63ab6a4939356667ef683905dea631bc2121aa534d347b16 +md5: 080594bf4493e6bae2607e65390c520a +depends: +- python >=3.10 +- zipp >=3.20 +- python +license: Apache-2.0 +license_family: APACHE +size: 34387 +timestamp: 1773931568510 +- conda: https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.6-pyhcf101f3_1.conda +sha256: fc9ca7348a4f25fed2079f2153ecdcf5f9cf2a0bc36c4172420ca09e1849df7b +md5: 04558c96691bed63104678757beb4f8d +depends: +- markupsafe >=2.0 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 120685 +timestamp: 1764517220861 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-4.26.0-pyhcf101f3_0.conda +sha256: db973a37d75db8e19b5f44bbbdaead0c68dde745407f281e2a7fe4db74ec51d7 +md5: ada41c863af263cc4c5fcbaff7c3e4dc +depends: +- attrs >=22.2.0 +- jsonschema-specifications >=2023.3.6 +- python >=3.10 +- referencing >=0.28.4 +- rpds-py >=0.25.0 +- python +license: MIT +license_family: MIT +size: 82356 +timestamp: 1767839954256 +- conda: https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2025.9.1-pyhcf101f3_0.conda +sha256: 0a4f3b132f0faca10c89fdf3b60e15abb62ded6fa80aebfc007d05965192aa04 +md5: 439cd0f567d697b20a8f45cb70a1005a +depends: +- python >=3.10 +- referencing >=0.31.0 +- python +license: MIT +license_family: MIT +size: 19236 +timestamp: 1757335715225 
+- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/kaleido-core-0.2.1-he5a581e_0.tar.bz2 +sha256: d3c7f4797566e6f983d16c2a87063a18e4b2d819a66230190a21584d70042755 +md5: 4f0d284f5d11e04277b552eb1c172c7f +depends: +- __glibc >=2.17,<3.0.a0 +- expat >=2.2.10,<3.0.0a0 +- fontconfig +- fonts-conda-forge +- libgcc-ng >=9.3.0 +- mathjax 2.7.* +- nspr >=4.29,<5.0a0 +- nss >=3.62,<4.0a0 +- sqlite >=3.34.0,<4.0a0 +license: MIT +license_family: MIT +size: 65750397 +timestamp: 1615199465742 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lcms2-2.18-h9d5b58d_0.conda +sha256: 379ef5e91a587137391a6149755d0e929f1a007d2dcb211318ac670a46c8596f +md5: bb960f01525b5e001608afef9d47b79c +depends: +- libgcc >=14 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libtiff >=4.7.1,<4.8.0a0 +license: MIT +license_family: MIT +size: 293039 +timestamp: 1768184778398 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ld_impl_linux-aarch64-2.45.1-default_h1979696_102.conda +sha256: 7abd913d81a9bf00abb699e8987966baa2065f5132e37e815f92d90fc6bba530 +md5: a21644fc4a83da26452a718dc9468d5f +depends: +- zstd >=1.5.7,<1.6.0a0 +constrains: +- binutils_impl_linux-aarch64 2.45.1 +license: GPL-3.0-only +license_family: GPL +size: 875596 +timestamp: 1774197520746 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/lerc-4.1.0-h52b7260_0.conda +sha256: 8957fd460c1c132c8031f65fd5f56ec3807fd71b7cab2c5e2b0937b13404ab36 +md5: d13423b06447113a90b5b1366d4da171 +depends: +- libgcc >=14 +- libstdcxx >=14 +license: Apache-2.0 +license_family: Apache +size: 240444 +timestamp: 1773114901155 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libblas-3.11.0-5_haddc8a3_openblas.conda +build_number: 5 +sha256: 700f3c03d0fba8e687a345404a45fbabe781c1cf92242382f62cef2948745ec4 +md5: 5afcea37a46f76ec1322943b3c4dfdc0 +depends: +- libopenblas >=0.3.30,<0.3.31.0a0 +- libopenblas >=0.3.30,<1.0a0 +constrains: +- mkl <2026 +- libcblas 3.11.0 5*_openblas +- liblapack 3.11.0 5*_openblas +- 
liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +license: BSD-3-Clause +license_family: BSD +size: 18369 +timestamp: 1765818610617 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libcblas-3.11.0-5_hd72aa62_openblas.conda +build_number: 5 +sha256: 3fad5c9de161dccb4e42c8b1ae8eccb33f4ed56bccbcced9cbb0956ae7869e61 +md5: 0b2f1143ae2d0aa4c991959d0daaf256 +depends: +- libblas 3.11.0 5_haddc8a3_openblas +constrains: +- liblapack 3.11.0 5*_openblas +- liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +license: BSD-3-Clause +license_family: BSD +size: 18371 +timestamp: 1765818618899 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libdeflate-1.25-h1af38f5_0.conda +sha256: 48814b73bd462da6eed2e697e30c060ae16af21e9fbed30d64feaf0aad9da392 +md5: a9138815598fe6b91a1d6782ca657b0c +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 71117 +timestamp: 1761979776756 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libexpat-2.7.4-hfae3067_0.conda +sha256: 995ce3ad96d0f4b5ed6296b051a0d7b6377718f325bc0e792fbb96b0e369dad7 +md5: 57f3b3da02a50a1be2a6fe847515417d +depends: +- libgcc >=14 +constrains: +- expat 2.7.4.* +license: MIT +license_family: MIT +size: 76564 +timestamp: 1771259530958 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libffi-3.5.2-h376a255_0.conda +sha256: 3df4c539449aabc3443bbe8c492c01d401eea894603087fca2917aa4e1c2dea9 +md5: 2f364feefb6a7c00423e80dcb12db62a +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 55952 +timestamp: 1769456078358 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype-2.14.3-h8af1aa0_0.conda +sha256: 752e4f66283d7deb4c6fd47d88df644d8daa2aaa825a54f3bf350a625190192a +md5: a229e22d4d8814a07702b0919d8e6701 +depends: +- libfreetype6 >=2.14.3 +license: GPL-2.0-only OR FTL +size: 8125 +timestamp: 1774301094057 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libfreetype6-2.14.3-hdae7a39_0.conda +sha256: 
8e6b27fe4eec4c2fa7b7769a21973734c8dba1de80086fb0213e58375ac09f4c +md5: b99ed99e42dafb27889483b3098cace7 +depends: +- libgcc >=14 +- libpng >=1.6.55,<1.7.0a0 +- libzlib >=1.3.2,<2.0a0 +constrains: +- freetype >=2.14.3 +license: GPL-2.0-only OR FTL +size: 422941 +timestamp: 1774301093473 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-15.2.0-h8acb6b2_18.conda +sha256: 43df385bedc1cab11993c4369e1f3b04b4ca5d0ea16cba6a0e7f18dbc129fcc9 +md5: 552567ea2b61e3a3035759b2fdb3f9a6 +depends: +- _openmp_mutex >=4.5 +constrains: +- libgcc-ng ==15.2.0=*_18 +- libgomp 15.2.0 h8acb6b2_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 622900 +timestamp: 1771378128706 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgcc-ng-15.2.0-he9431aa_18.conda +sha256: 83bb0415f59634dccfa8335d4163d1f6db00a27b36666736f9842b650b92cf2f +md5: 4feebd0fbf61075a1a9c2e9b3936c257 +depends: +- libgcc 15.2.0 h8acb6b2_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27568 +timestamp: 1771378136019 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran-15.2.0-he9431aa_18.conda +sha256: 7dcd7dff2505d56fd5272a6e712ec912f50a46bf07dc6873a7e853694304e6e4 +md5: 41f261f5e4e2e8cbd236c2f1f15dae1b +depends: +- libgfortran5 15.2.0 h1b7bec0_18 +constrains: +- libgfortran-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 27587 +timestamp: 1771378169244 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgfortran5-15.2.0-h1b7bec0_18.conda +sha256: 85347670dfb4a8d4c13cd7cae54138dcf2b1606b6bede42eef5507bf5f9660c6 +md5: 574d88ce3348331e962cfa5ed451b247 +depends: +- libgcc >=15.2.0 +constrains: +- libgfortran 15.2.0 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 1486341 +timestamp: 1771378148102 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libgomp-15.2.0-h8acb6b2_18.conda +sha256: 
fc716f11a6a8525e27a5d332ef6a689210b0d2a4dd1133edc0f530659aa9faa6 +md5: 4faa39bf919939602e594253bd673958 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 588060 +timestamp: 1771378040807 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libjpeg-turbo-3.1.2-he30d5cf_0.conda +sha256: 84064c7c53a64291a585d7215fe95ec42df74203a5bf7615d33d49a3b0f08bb6 +md5: 5109d7f837a3dfdf5c60f60e311b041f +depends: +- libgcc >=14 +constrains: +- jpeg <0.0.0a +license: IJG AND BSD-3-Clause AND Zlib +size: 691818 +timestamp: 1762094728337 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblapack-3.11.0-5_h88aeb00_openblas.conda +build_number: 5 +sha256: 692222d186d3ffbc99eaf04b5b20181fd26aee1edec1106435a0a755c57cce86 +md5: 88d1e4133d1182522b403e9ba7435f04 +depends: +- libblas 3.11.0 5_haddc8a3_openblas +constrains: +- liblapacke 3.11.0 5*_openblas +- blas 2.305 openblas +- libcblas 3.11.0 5*_openblas +license: BSD-3-Clause +license_family: BSD +size: 18392 +timestamp: 1765818627104 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/liblzma-5.8.2-he30d5cf_0.conda +sha256: 843c46e20519651a3e357a8928352b16c5b94f4cd3d5481acc48be2e93e8f6a3 +md5: 96944e3c92386a12755b94619bae0b35 +depends: +- libgcc >=14 +constrains: +- xz 5.8.2.* +license: 0BSD +size: 125916 +timestamp: 1768754941722 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libmpdec-4.0.0-he30d5cf_1.conda +sha256: 57c0dd12d506e84541c4e877898bd2a59cca141df493d34036f18b2751e0a453 +md5: 7b9813e885482e3ccb1fa212b86d7fd0 +depends: +- libgcc >=14 +license: BSD-2-Clause +license_family: BSD +size: 114056 +timestamp: 1769482343003 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libopenblas-0.3.30-pthreads_h9d3fd7e_4.conda +sha256: 794a7270ea049ec931537874cd8d2de0ef4b3cef71c055cfd8b4be6d2f4228b0 +md5: 11d7d57b7bdd01da745bbf2b67020b2e +depends: +- libgcc >=14 +- libgfortran +- libgfortran5 >=14.3.0 +constrains: +- openblas >=0.3.30,<0.3.31.0a0 +license: 
BSD-3-Clause +license_family: BSD +size: 4959359 +timestamp: 1763114173544 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libpng-1.6.55-h1abf092_0.conda +sha256: c7378c6b79de4d571d00ad1caf0a4c19d43c9c94077a761abb6ead44d891f907 +md5: be4088903b94ea297975689b3c3aeb27 +depends: +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +license: zlib-acknowledgement +size: 340156 +timestamp: 1770691477245 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libsqlite-3.52.0-h10b116e_0.conda +sha256: 1ddaf91b44fae83856276f4cb7ce544ffe41d4b55c1e346b504c6b45f19098d6 +md5: 77891484f18eca74b8ad83694da9815e +depends: +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +license: blessing +size: 952296 +timestamp: 1772818881550 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libstdcxx-15.2.0-hef695bb_18.conda +sha256: 31fdb9ffafad106a213192d8319b9f810e05abca9c5436b60e507afb35a6bc40 +md5: f56573d05e3b735cb03efeb64a15f388 +depends: +- libgcc 15.2.0 h8acb6b2_18 +constrains: +- libstdcxx-ng ==15.2.0=*_18 +license: GPL-3.0-only WITH GCC-exception-3.1 +license_family: GPL +size: 5541411 +timestamp: 1771378162499 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libtiff-4.7.1-hdb009f0_1.conda +sha256: 7ff79470db39e803e21b8185bc8f19c460666d5557b1378d1b1e857d929c6b39 +md5: 8c6fd84f9c87ac00636007c6131e457d +depends: +- lerc >=4.0.0,<5.0a0 +- libdeflate >=1.25,<1.26.0a0 +- libgcc >=14 +- libjpeg-turbo >=3.1.0,<4.0a0 +- liblzma >=5.8.1,<6.0a0 +- libstdcxx >=14 +- libwebp-base >=1.6.0,<2.0a0 +- libzlib >=1.3.1,<2.0a0 +- zstd >=1.5.7,<1.6.0a0 +license: HPND +size: 488407 +timestamp: 1762022048105 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libuuid-2.41.3-h1022ec0_0.conda +sha256: c37a8e89b700646f3252608f8368e7eb8e2a44886b92776e57ad7601fc402a11 +md5: cf2861212053d05f27ec49c3784ff8bb +depends: +- libgcc >=14 +license: BSD-3-Clause +license_family: BSD +size: 43453 +timestamp: 1766271546875 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/libwebp-base-1.6.0-ha2e29f5_0.conda +sha256: b03700a1f741554e8e5712f9b06dd67e76f5301292958cd3cb1ac8c6fdd9ed25 +md5: 24e92d0942c799db387f5c9d7b81f1af +depends: +- libgcc >=14 +constrains: +- libwebp 1.6.0 +license: BSD-3-Clause +license_family: BSD +size: 359496 +timestamp: 1752160685488 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libxcb-1.17.0-h262b8f6_0.conda +sha256: 461cab3d5650ac6db73a367de5c8eca50363966e862dcf60181d693236b1ae7b +md5: cd14ee5cca2464a425b1dbfc24d90db2 +depends: +- libgcc >=13 +- pthread-stubs +- xorg-libxau >=1.0.11,<2.0a0 +- xorg-libxdmcp +license: MIT +license_family: MIT +size: 397493 +timestamp: 1727280745441 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/libzlib-1.3.2-hdc9db2a_2.conda +sha256: eb111e32e5a7313a5bf799c7fb2419051fa2fe7eff74769fac8d5a448b309f7f +md5: 502006882cf5461adced436e410046d1 +constrains: +- zlib 1.3.2 *_2 +license: Zlib +license_family: Other +size: 69833 +timestamp: 1774072605429 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-3.10.2-pyhcf101f3_0.conda +sha256: 20e0892592a3e7c683e3d66df704a9425d731486a97c34fc56af4da1106b2b6b +md5: ba0a9221ce1063f31692c07370d062f3 +depends: +- importlib-metadata >=4.4 +- python >=3.10 +- python +license: BSD-3-Clause +license_family: BSD +size: 85893 +timestamp: 1770694658918 +- conda: https://conda.anaconda.org/conda-forge/noarch/markdown-it-py-4.0.0-pyhd8ed1ab_0.conda +sha256: 7b1da4b5c40385791dbc3cc85ceea9fad5da680a27d5d3cb8bfaa185e304a89e +md5: 5b5203189eb668f042ac2b0826244964 +depends: +- mdurl >=0.1,<1 +- python >=3.10 +license: MIT +license_family: MIT +size: 64736 +timestamp: 1754951288511 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/markupsafe-3.0.3-py314hb76de3f_1.conda +sha256: 383c188496d13a55658c06e61e7d4cdff2c9f9d5a0648769fca8250bece7e0ef +md5: e5de3c36dd548b35ff2a8aa49208dcb3 +depends: +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* 
*_cp314 +constrains: +- jinja2 >=3.0.0 +license: BSD-3-Clause +license_family: BSD +size: 27913 +timestamp: 1772446407659 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/mathjax-2.7.7-h8af1aa0_3.tar.bz2 +sha256: 8fd4c79d6eda3d4cba73783114305a53a154ada4d1e334d4e02cb3521429599b +md5: 7b08314a6867a9d5648a1c3265e9eb8e +license: Apache-2.0 +license_family: Apache +size: 22257008 +timestamp: 1662784555011 +- conda: https://conda.anaconda.org/conda-forge/noarch/mdurl-0.1.2-pyhd8ed1ab_1.conda +sha256: 78c1bbe1723449c52b7a9df1af2ee5f005209f67e40b6e1d3c7619127c43b1c7 +md5: 592132998493b3ff25fd7479396e8351 +depends: +- python >=3.9 +license: MIT +license_family: MIT +size: 14465 +timestamp: 1733255681319 +- conda: https://conda.anaconda.org/bioconda/noarch/multiqc-1.33-pyhdfd78af_0.conda +sha256: f005760b13093362fc9c997d603dd487de32ab2e821a3cbce52a42bcb8136517 +md5: 698a8a27c2b9d8a542c70cb47099a75e +depends: +- click +- coloredlogs +- humanize +- importlib-metadata +- jinja2 >=3.0.0 +- jsonschema +- markdown +- natsort +- numpy +- packaging +- pillow >=10.2.0 +- plotly >=5.18 +- polars-lts-cpu +- pyaml-env +- pydantic >=2.7.1 +- python >=3.8,!=3.14.1 +- python-dotenv +- python-kaleido 0.2.1 +- pyyaml >=4 +- requests +- rich >=10 +- rich-click +- spectra >=0.0.10 +- tiktoken +- tqdm +- typeguard +license: GPL-3.0-or-later +license_family: GPL3 +size: 4198799 +timestamp: 1765300743879 +- conda: https://conda.anaconda.org/conda-forge/noarch/narwhals-2.18.1-pyhcf101f3_1.conda +sha256: 541fd4390a0687228b8578247f1536a821d9261389a65585af9d1a6f2a14e1e0 +md5: 30bec5e8f4c3969e2b1bd407c5e52afb +depends: +- python >=3.10 +- python +license: MIT +size: 280459 +timestamp: 1774380620329 +- conda: https://conda.anaconda.org/conda-forge/noarch/natsort-8.4.0-pyhcf101f3_2.conda +sha256: aeb1548eb72e4f198e72f19d242fb695b35add2ac7b2c00e0d83687052867680 +md5: e941e85e273121222580723010bd4fa2 +depends: +- python >=3.9 +- python +license: MIT +license_family: MIT +size: 39262 
+timestamp: 1770905275632 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/ncurses-6.5-ha32ae93_3.conda +sha256: 91cfb655a68b0353b2833521dc919188db3d8a7f4c64bea2c6a7557b24747468 +md5: 182afabe009dc78d8b73100255ee6868 +depends: +- libgcc >=13 +license: X11 AND BSD-3-Clause +size: 926034 +timestamp: 1738196018799 +- conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.6.1-pyhcf101f3_0.conda +sha256: f6a82172afc50e54741f6f84527ef10424326611503c64e359e25a19a8e4c1c6 +md5: a2c1eeadae7a309daed9d62c96012a2b +depends: +- python >=3.11 +- python +constrains: +- numpy >=1.25 +- scipy >=1.11.2 +- matplotlib-base >=3.8 +- pandas >=2.0 +license: BSD-3-Clause +license_family: BSD +size: 1587439 +timestamp: 1765215107045 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nspr-4.38-h3ad9384_0.conda +sha256: 78a06e89285fef242e272998b292c1e621e3ee3dd4fba62ec014e503c7ec118f +md5: 6dd4f07147774bf720075a210f8026b9 +depends: +- libgcc >=14 +- libstdcxx >=14 +license: MPL-2.0 +license_family: MOZILLA +size: 235140 +timestamp: 1762350120355 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/nss-3.118-h544fa81_0.conda +sha256: 48942696889367ffd448f8dccfc080fb7e130b9938a4a3b6b20ef8e6af856463 +md5: 4540f9570d12db2150f42ba036154552 +depends: +- libgcc >=14 +- libsqlite >=3.51.0,<4.0a0 +- libstdcxx >=14 +- libzlib >=1.3.1,<2.0a0 +- nspr >=4.38,<5.0a0 +license: MPL-2.0 +license_family: MOZILLA +size: 2061869 +timestamp: 1763490303490 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/numpy-2.4.3-py314haac167e_0.conda +sha256: a6d42fd88afc57c3b0a57b21a12eff7492dfc419bb61ee3f74e9ba6261dabc88 +md5: 25d896c331481145720a21e5145fad65 +depends: +- python +- libgcc >=14 +- python 3.14.* *_cp314 +- libstdcxx >=14 +- libcblas >=3.9.0,<4.0a0 +- liblapack >=3.9.0,<4.0a0 +- python_abi 3.14.* *_cp314 +- libblas >=3.9.0,<4.0a0 +constrains: +- numpy-base <0a0 +license: BSD-3-Clause +license_family: BSD +size: 8008045 +timestamp: 1773839355275 +- 
conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openjpeg-2.5.4-h5da879a_0.conda +sha256: bd1bc8bdde5e6c5cbac42d462b939694e40b59be6d0698f668515908640c77b8 +md5: cea962410e327262346d48d01f05936c +depends: +- libgcc >=14 +- libpng >=1.6.50,<1.7.0a0 +- libstdcxx >=14 +- libtiff >=4.7.1,<4.8.0a0 +- libzlib >=1.3.1,<2.0a0 +license: BSD-2-Clause +license_family: BSD +size: 392636 +timestamp: 1758489353577 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/openssl-3.6.1-h546c87b_1.conda +sha256: 7f8048c0e75b2620254218d72b4ae7f14136f1981c5eb555ef61645a9344505f +md5: 25f5885f11e8b1f075bccf4a2da91c60 +depends: +- ca-certificates +- libgcc >=14 +license: Apache-2.0 +license_family: Apache +size: 3692030 +timestamp: 1769557678657 +- conda: https://conda.anaconda.org/conda-forge/noarch/packaging-26.0-pyhcf101f3_0.conda +sha256: c1fc0f953048f743385d31c468b4a678b3ad20caffdeaa94bed85ba63049fd58 +md5: b76541e68fea4d511b1ac46a28dcd2c6 +depends: +- python >=3.8 +- python +license: Apache-2.0 +license_family: APACHE +size: 72010 +timestamp: 1769093650580 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pillow-12.1.1-py314hac3e5ec_0.conda +sha256: 1ca2d1616baad9bccb7ebc425ef2dcd6cebe742fbe91edf226fb606ad371ca0f +md5: d3c959c7efe560b2d7da459d69121fe9 +depends: +- python +- python 3.14.* *_cp314 +- libgcc >=14 +- zlib-ng >=2.3.3,<2.4.0a0 +- libwebp-base >=1.6.0,<2.0a0 +- tk >=8.6.13,<8.7.0a0 +- libfreetype >=2.14.1 +- libfreetype6 >=2.14.1 +- libtiff >=4.7.1,<4.8.0a0 +- lcms2 >=2.18,<3.0a0 +- python_abi 3.14.* *_cp314 +- openjpeg >=2.5.4,<3.0a0 +- libjpeg-turbo >=3.1.2,<4.0a0 +- libxcb >=1.17.0,<2.0a0 +license: HPND +size: 1051828 +timestamp: 1770794010335 +- conda: https://conda.anaconda.org/conda-forge/noarch/plotly-6.6.0-pyhd8ed1ab_0.conda +sha256: c418d325359fc7a0074cea7f081ef1bce26e114d2da8a0154c5d27ecc87a08e7 +md5: 3e9427ee186846052e81fadde8ebe96a +depends: +- narwhals >=1.15.1 +- packaging +- python >=3.10 +constrains: +- ipywidgets >=7.6 
+license: MIT +license_family: MIT +size: 5251872 +timestamp: 1772628857717 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-1.39.3-pyh58ad624_1.conda +sha256: d332c2d5002fc440ae37ed9679ffc21b552f18d20232390005d1dd3bce0888d3 +md5: d5a4e013a30dd8dfde9ab39f45aaf9c1 +depends: +- polars-runtime-32 ==1.39.3 +- python >=3.10 +- python +constrains: +- numpy >=1.16.0 +- pyarrow >=7.0.0 +- fastexcel >=0.9 +- openpyxl >=3.0.0 +- xlsx2csv >=0.8.0 +- connectorx >=0.3.2 +- deltalake >=1.0.0 +- pyiceberg >=0.7.1 +- altair >=5.4.0 +- great_tables >=0.8.0 +- polars-runtime-32 ==1.39.3 +- polars-runtime-64 ==1.39.3 +- polars-runtime-compat ==1.39.3 +license: MIT +license_family: MIT +size: 533495 +timestamp: 1774207987966 +- conda: https://conda.anaconda.org/conda-forge/noarch/polars-lts-cpu-1.34.0.deprecated-hc364b38_0.conda +sha256: e466fb31f67ba9bde18deafeb34263ca5eb25807f39ead0e9d753a8e82c4c4f4 +md5: ef0340e75068ac8ff96462749b5c98e7 +depends: +- polars >=1.34.0 +- polars-runtime-compat >=1.34.0 +license: MIT +license_family: MIT +size: 3902 +timestamp: 1760206808444 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-32-1.39.3-py310hff09b76_1.conda +noarch: python +sha256: c070be507c5a90df397a47ae0299660be437d5546d68f1bc0fa4402c9f07d59e +md5: 3c1a7c6b4ba8b9fb773ace9723f8a5db +depends: +- python +- libgcc >=14 +- libstdcxx >=14 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 34785466 +timestamp: 1774207998285 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/polars-runtime-compat-1.39.3-py310hf00a4a2_1.conda +noarch: python +sha256: 683315f1a49e47ce72bf9462419733b40b588b2b3106552d95fd4cd994e174de +md5: dd3464e2132dc3a783e76e5078870c76 +depends: +- python +- libgcc >=14 +- libstdcxx >=14 +- _python_abi3_support 1.* +- cpython >=3.10 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 34652491 +timestamp: 1774207996879 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/procps-ng-4.0.6-h1779866_0.conda +sha256: e9cbcbc94e151ada3d6dc365380aaaf591f65012c16d9a2abaea4b9b90adc402 +md5: ab7288cc39545556d1bc5e71ab2df9a9 +depends: +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-2.0-or-later AND LGPL-2.0-or-later +license_family: GPL +size: 636733 +timestamp: 1769712412683 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pthread-stubs-0.4-h86ecc28_1002.conda +sha256: 977dfb0cb3935d748521dd80262fe7169ab82920afd38ed14b7fee2ea5ec01ba +md5: bb5a90c93e3bac3d5690acf76b4a6386 +depends: +- libgcc >=13 +license: MIT +license_family: MIT +size: 8342 +timestamp: 1726803319942 +- conda: https://conda.anaconda.org/conda-forge/noarch/pyaml-env-1.2.2-pyhd8ed1ab_0.conda +sha256: 58994e0d2ea8584cb399546e6f6896d771995e6121d1a7b6a2c9948388358932 +md5: e17be1016bcc3516827b836cd3e4d9dc +depends: +- python >=3.9 +- pyyaml >=5.0,<=7.0 +license: MIT +license_family: MIT +size: 14645 +timestamp: 1736766960536 +- conda: https://conda.anaconda.org/conda-forge/noarch/pydantic-2.12.5-pyhcf101f3_1.conda +sha256: 868569d9505b7fe246c880c11e2c44924d7613a8cdcc1f6ef85d5375e892f13d +md5: c3946ed24acdb28db1b5d63321dbca7d +depends: +- typing-inspection >=0.4.2 +- typing_extensions >=4.14.1 +- python >=3.10 +- typing-extensions >=4.6.1 +- annotated-types >=0.6.0 +- pydantic-core ==2.41.5 +- python +license: MIT +license_family: MIT +size: 340482 +timestamp: 1764434463101 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pydantic-core-2.41.5-py314h451b6cc_1.conda +sha256: f8acb2d03ebe80fed0032b9a989fc9acfb6735e3cd3f8c704b72728cb31868f6 +md5: 28f5027a1e04d67aa13fac1c5ba79693 +depends: +- python +- typing-extensions >=4.6.0,!=4.7.0 +- libgcc >=14 +- python 3.14.* *_cp314 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 1828339 +timestamp: 1762989038561 +- conda: https://conda.anaconda.org/conda-forge/noarch/pygments-2.19.2-pyhd8ed1ab_0.conda 
+sha256: 5577623b9f6685ece2697c6eb7511b4c9ac5fb607c9babc2646c811b428fd46a +md5: 6b6ece66ebcae2d5f326c77ef2c5a066 +depends: +- python >=3.9 +license: BSD-2-Clause +license_family: BSD +size: 889287 +timestamp: 1750615908735 +- conda: https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha55dd90_7.conda +sha256: ba3b032fa52709ce0d9fd388f63d330a026754587a2f461117cac9ab73d8d0d8 +md5: 461219d1a5bd61342293efa2c0c90eac +depends: +- __unix +- python >=3.9 +license: BSD-3-Clause +license_family: BSD +size: 21085 +timestamp: 1733217331982 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/python-3.14.3-hb06a95a_101_cp314.conda +build_number: 101 +sha256: 87e9dff5646aba87cecfbc08789634c855871a7325169299d749040b0923a356 +md5: 205011b36899ff0edf41b3db0eda5a44 +depends: +- bzip2 >=1.0.8,<2.0a0 +- ld_impl_linux-aarch64 >=2.36.1 +- libexpat >=2.7.3,<3.0a0 +- libffi >=3.5.2,<3.6.0a0 +- libgcc >=14 +- liblzma >=5.8.2,<6.0a0 +- libmpdec >=4.0.0,<5.0a0 +- libsqlite >=3.51.2,<4.0a0 +- libuuid >=2.41.3,<3.0a0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- openssl >=3.5.5,<4.0a0 +- python_abi 3.14.* *_cp314 +- readline >=8.3,<9.0a0 +- tk >=8.6.13,<8.7.0a0 +- tzdata +- zstd >=1.5.7,<1.6.0a0 +license: Python-2.0 +size: 37305578 +timestamp: 1770674395875 +python_site_packages_path: lib/python3.14/site-packages +- conda: https://conda.anaconda.org/conda-forge/noarch/python-dotenv-1.2.2-pyhcf101f3_0.conda +sha256: 74e417a768f59f02a242c25e7db0aa796627b5bc8c818863b57786072aeb85e5 +md5: 130584ad9f3a513cdd71b1fdc1244e9c +depends: +- python >=3.10 +license: BSD-3-Clause +license_family: BSD +size: 27848 +timestamp: 1772388605021 +- conda: https://conda.anaconda.org/conda-forge/noarch/python-gil-3.14.3-h4df99d1_101.conda +sha256: 233aebd94c704ac112afefbb29cf4170b7bc606e22958906f2672081bc50638a +md5: 235765e4ea0d0301c75965985163b5a1 +depends: +- cpython 3.14.3.* +- python_abi * *_cp314 +license: Python-2.0 +size: 50062 +timestamp: 1770674497152 +- conda: 
https://conda.anaconda.org/conda-forge/noarch/python-kaleido-0.2.1-pyhd8ed1ab_0.tar.bz2 +sha256: e17bf63a30aec33432f1ead86e15e9febde9fc40a7f869c0e766be8d2db44170 +md5: 310259a5b03ff02289d7705f39e2b1d2 +depends: +- kaleido-core 0.2.1.* +- python >=3.5 +license: MIT +license_family: MIT +size: 18320 +timestamp: 1615204747600 +- conda: https://conda.anaconda.org/conda-forge/noarch/python_abi-3.14-8_cp314.conda +build_number: 8 +sha256: ad6d2e9ac39751cc0529dd1566a26751a0bf2542adb0c232533d32e176e21db5 +md5: 0539938c55b6b1a59b560e843ad864a4 +constrains: +- python 3.14.* *_cp314 +license: BSD-3-Clause +license_family: BSD +size: 6989 +timestamp: 1752805904792 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/pyyaml-6.0.3-py314h807365f_1.conda +sha256: 496b5e65dfdd0aaaaa5de0dcaaf3bceea00fcb4398acf152f89e567c82ec1046 +md5: 9ae2c92975118058bd720e9ba2bb7c58 +depends: +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python >=3.14,<3.15.0a0 *_cp314 +- python_abi 3.14.* *_cp314 +- yaml >=0.2.5,<0.3.0a0 +license: MIT +license_family: MIT +size: 195678 +timestamp: 1770223441816 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/readline-8.3-hb682ff5_0.conda +sha256: fe695f9d215e9a2e3dd0ca7f56435ab4df24f5504b83865e3d295df36e88d216 +md5: 3d49cad61f829f4f0e0611547a9cda12 +depends: +- libgcc >=14 +- ncurses >=6.5,<7.0a0 +license: GPL-3.0-only +license_family: GPL +size: 357597 +timestamp: 1765815673644 +- conda: https://conda.anaconda.org/conda-forge/noarch/referencing-0.37.0-pyhcf101f3_0.conda +sha256: 0577eedfb347ff94d0f2fa6c052c502989b028216996b45c7f21236f25864414 +md5: 870293df500ca7e18bedefa5838a22ab +depends: +- attrs >=22.2.0 +- python >=3.10 +- rpds-py >=0.7.0 +- typing_extensions >=4.4.0 +- python +license: MIT +license_family: MIT +size: 51788 +timestamp: 1760379115194 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/regex-2026.2.28-py314h51f160d_0.conda +sha256: 2080ecea825e1ef91a2422cc0bc63e85db9e38908ed17657fb8f41de7a6eee71 +md5: 
818aa2c9f6b3c808da5e7be22a9a424c +depends: +- libgcc >=14 +- python >=3.14,<3.15.0a0 +- python >=3.14,<3.15.0a0 *_cp314 +- python_abi 3.14.* *_cp314 +license: Apache-2.0 AND CNRI-Python +license_family: PSF +size: 408097 +timestamp: 1772255205521 +- conda: https://conda.anaconda.org/conda-forge/noarch/requests-2.32.5-pyhcf101f3_1.conda +sha256: 7813c38b79ae549504b2c57b3f33394cea4f2ad083f0994d2045c2e24cb538c5 +md5: c65df89a0b2e321045a9e01d1337b182 +depends: +- python >=3.10 +- certifi >=2017.4.17 +- charset-normalizer >=2,<4 +- idna >=2.5,<4 +- urllib3 >=1.21.1,<3 +- python +constrains: +- chardet >=3.0.2,<6 +license: Apache-2.0 +license_family: APACHE +size: 63602 +timestamp: 1766926974520 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-14.3.3-pyhcf101f3_0.conda +sha256: b06ce84d6a10c266811a7d3adbfa1c11f13393b91cc6f8a5b468277d90be9590 +md5: 7a6289c50631d620652f5045a63eb573 +depends: +- markdown-it-py >=2.2.0 +- pygments >=2.13.0,<3.0.0 +- python >=3.10 +- typing_extensions >=4.0.0,<5.0.0 +- python +license: MIT +license_family: MIT +size: 208472 +timestamp: 1771572730357 +- conda: https://conda.anaconda.org/conda-forge/noarch/rich-click-1.9.7-pyh8f84b5b_0.conda +sha256: aa3fcb167321bae51998de2e94d199109c9024f25a5a063cb1c28d8f1af33436 +md5: 0c20a8ebcddb24a45da89d5e917e6cb9 +depends: +- python >=3.10 +- rich >=12 +- click >=8 +- typing-extensions >=4 +- __unix +- python +license: MIT +license_family: MIT +size: 64356 +timestamp: 1769850479089 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/rpds-py-0.30.0-py314h02b7a91_0.conda +sha256: a587240f16eac7c6a80f9585cef679cd1cb9a287b8dfcdd36dcef1f7e7db15dc +md5: e7f6ed9e60043bb5cbcc527764897f0d +depends: +- python +- libgcc >=14 +- python_abi 3.14.* *_cp314 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 376332 +timestamp: 1764543345455 +- conda: https://conda.anaconda.org/conda-forge/noarch/spectra-0.0.11-pyhd8ed1ab_2.conda +sha256: 
7c65782d2511738e62c70462e89d65da4fa54d5a7e47c46667bcd27a59f81876 +md5: 472239e4eb7b5a84bb96b3ed7e3a596a +depends: +- colormath >=3.0.0 +- python >=3.9 +license: MIT +license_family: MIT +size: 22284 +timestamp: 1735770589188 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/sqlite-3.52.0-hf1c7be2_0.conda +sha256: 4f8523f5341f0d9e1547085206c6c1f71f9fc7c277443ca363a8cf98add8fc01 +md5: d9634079df93a65ee045b3c75f35cae1 +depends: +- icu >=78.2,<79.0a0 +- libgcc >=14 +- libsqlite 3.52.0 h10b116e_0 +- libzlib >=1.3.1,<2.0a0 +- ncurses >=6.5,<7.0a0 +- readline >=8.3,<9.0a0 +license: blessing +size: 209416 +timestamp: 1772818891689 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tiktoken-0.12.0-py314h6a36e60_3.conda +sha256: c1da41c79262b27efa168407cfecc47b20270e5fc071a8307f95a2c85fb94170 +md5: 55bf7b559202236157b14323b40f19e6 +depends: +- libgcc >=14 +- libstdcxx >=14 +- python >=3.14,<3.15.0a0 +- python_abi 3.14.* *_cp314 +- regex >=2022.1.18 +- requests >=2.26.0 +constrains: +- __glibc >=2.17 +license: MIT +license_family: MIT +size: 914402 +timestamp: 1764030357702 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/tk-8.6.13-noxft_h0dc03b3_103.conda +sha256: e25c314b52764219f842b41aea2c98a059f06437392268f09b03561e4f6e5309 +md5: 7fc6affb9b01e567d2ef1d05b84aa6ed +depends: +- libgcc >=14 +- libzlib >=1.3.1,<2.0a0 +constrains: +- xorg-libx11 >=1.8.12,<2.0a0 +license: TCL +license_family: BSD +size: 3368666 +timestamp: 1769464148928 +- conda: https://conda.anaconda.org/conda-forge/noarch/tqdm-4.67.3-pyh8f84b5b_0.conda +sha256: 9ef8e47cf00e4d6dcc114eb32a1504cc18206300572ef14d76634ba29dfe1eb6 +md5: e5ce43272193b38c2e9037446c1d9206 +depends: +- python >=3.10 +- __unix +- python +license: MPL-2.0 and MIT +size: 94132 +timestamp: 1770153424136 +- conda: https://conda.anaconda.org/conda-forge/noarch/typeguard-4.5.1-pyhd8ed1ab_0.conda +sha256: 39d8ae33c43cdb8f771373e149b0b4fae5a08960ac58dcca95b2f1642bb17448 +md5: 
260af1b0a94f719de76b4e14094e9a3b +depends: +- importlib-metadata >=3.6 +- python >=3.10 +- typing-extensions >=4.10.0 +- typing_extensions >=4.14.0 +constrains: +- pytest >=7 +license: MIT +license_family: MIT +size: 36838 +timestamp: 1771532971545 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.15.0-h396c80c_0.conda +sha256: 7c2df5721c742c2a47b2c8f960e718c930031663ac1174da67c1ed5999f7938c +md5: edd329d7d3a4ab45dcf905899a7a6115 +depends: +- typing_extensions ==4.15.0 pyhcf101f3_0 +license: PSF-2.0 +license_family: PSF +size: 91383 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing-inspection-0.4.2-pyhd8ed1ab_1.conda +sha256: 70db27de58a97aeb7ba7448366c9853f91b21137492e0b4430251a1870aa8ff4 +md5: a0a4a3035667fc34f29bfbd5c190baa6 +depends: +- python >=3.10 +- typing_extensions >=4.12.0 +license: MIT +license_family: MIT +size: 18923 +timestamp: 1764158430324 +- conda: https://conda.anaconda.org/conda-forge/noarch/typing_extensions-4.15.0-pyhcf101f3_0.conda +sha256: 032271135bca55aeb156cee361c81350c6f3fb203f57d024d7e5a1fc9ef18731 +md5: 0caa1af407ecff61170c9437a808404d +depends: +- python >=3.10 +- python +license: PSF-2.0 +license_family: PSF +size: 51692 +timestamp: 1756220668932 +- conda: https://conda.anaconda.org/conda-forge/noarch/tzdata-2025c-hc9c84f9_1.conda +sha256: 1d30098909076af33a35017eed6f2953af1c769e273a0626a04722ac4acaba3c +md5: ad659d0a2b3e47e38d829aa8cad2d610 +license: LicenseRef-Public-Domain +size: 119135 +timestamp: 1767016325805 +- conda: https://conda.anaconda.org/conda-forge/noarch/urllib3-2.6.3-pyhd8ed1ab_0.conda +sha256: af641ca7ab0c64525a96fd9ad3081b0f5bcf5d1cbb091afb3f6ed5a9eee6111a +md5: 9272daa869e03efe68833e3dc7a02130 +depends: +- backports.zstd >=1.0.0 +- brotli-python >=1.2.0 +- h2 >=4,<5 +- pysocks >=1.5.6,<2.0,!=1.5.7 +- python >=3.10 +license: MIT +license_family: MIT +size: 103172 +timestamp: 1767817860341 +- conda: 
https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxau-1.0.12-he30d5cf_1.conda +sha256: e9f6e931feeb2f40e1fdbafe41d3b665f1ab6cb39c5880a1fcf9f79a3f3c84a5 +md5: 1c246e1105000c3660558459e2fd6d43 +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 16317 +timestamp: 1762977521691 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/xorg-libxdmcp-1.1.5-he30d5cf_1.conda +sha256: 128d72f36bcc8d2b4cdbec07507542e437c7d67f677b7d77b71ed9eeac7d6df1 +md5: bff06dcde4a707339d66d45d96ceb2e2 +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 21039 +timestamp: 1762979038025 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/yaml-0.2.5-h80f16a2_3.conda +sha256: 66265e943f32ce02396ad214e27cb35f5b0490b3bd4f064446390f9d67fa5d88 +md5: 032d8030e4a24fe1f72c74423a46fb88 +depends: +- libgcc >=14 +license: MIT +license_family: MIT +size: 88088 +timestamp: 1753484092643 +- conda: https://conda.anaconda.org/conda-forge/noarch/zipp-3.23.0-pyhcf101f3_1.conda +sha256: b4533f7d9efc976511a73ef7d4a2473406d7f4c750884be8e8620b0ce70f4dae +md5: 30cd29cb87d819caead4d55184c1d115 +depends: +- python >=3.10 +- python +license: MIT +license_family: MIT +size: 24194 +timestamp: 1764460141901 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zlib-ng-2.3.3-ha7cb516_1.conda +sha256: 638a3a41a4fbfed52d3c60c8ef5a3693b3f12a5b1a3f58fa29f5698d0a0702e2 +md5: f731af71c723065d91b4c01bb822641b +depends: +- libgcc >=14 +- libstdcxx >=14 +license: Zlib +license_family: Other +size: 121046 +timestamp: 1770167944449 +- conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-h85ac4a6_6.conda +sha256: 569990cf12e46f9df540275146da567d9c618c1e9c7a0bc9d9cfefadaed20b75 +md5: c3655f82dcea2aa179b291e7099c1fcc +depends: +- libzlib >=1.3.1,<2.0a0 +license: BSD-3-Clause +license_family: BSD +size: 614429 +timestamp: 1764777145593 diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index d02016a0..009874d4 
100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -4,4 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - bioconda::multiqc=1.32 + - bioconda::multiqc=1.33 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index c1158fb0..5376aea1 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,24 +1,21 @@ process MULTIQC { + tag "${meta.id}" label 'process_single' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8c/8c6c120d559d7ee04c7442b61ad7cf5a9e8970be5feefb37d68eeaa60c1034eb/data' : - 'community.wave.seqera.io/library/multiqc:1.32--d58f60e4deb769bf' }" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/34/34e733a9ae16a27e80fe00f863ea1479c96416017f24a907996126283e7ecd4d/data' + : 'community.wave.seqera.io/library/multiqc:1.33--ee7739d47738383b'}" input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - path(replace_names) - path(sample_names) + tuple val(meta), path(multiqc_files, stageAs: "?/*"), path(multiqc_config, stageAs: "?/*"), path(multiqc_logo), path(replace_names), path(sample_names) output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions + tuple val(meta), path("*.html"), emit: report + tuple val(meta), path("*_data"), emit: data + tuple val(meta), path("*_plots"), emit: plots, optional: true + // MultiQC should not push its versions to the `versions` topic. 
Its input depends on the versions topic to be resolved thus outputting to the topic will let the pipeline hang forever + tuple val("${task.process}"), val('multiqc'), eval('multiqc --version | sed "s/.* //g"'), emit: versions when: task.ext.when == null || task.ext.when @@ -26,38 +23,28 @@ process MULTIQC { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def config = multiqc_config ? multiqc_config instanceof List ? "--config ${multiqc_config.join(' --config ')}" : "--config ${multiqc_config}" : "" def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' def replace = replace_names ? "--replace-names ${replace_names}" : '' def samples = sample_names ? "--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ - $args \\ - $config \\ - $prefix \\ - $extra_config \\ - $logo \\ - $replace \\ - $samples \\ + ${args} \\ + ${config} \\ + ${prefix} \\ + ${logo} \\ + ${replace} \\ + ${samples} \\ . 
- - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS """ stub: """ mkdir multiqc_data + touch multiqc_data/.stub mkdir multiqc_plots + touch multiqc_plots/.stub touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS """ } diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ce30eb73..57cf43ca 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,6 +1,6 @@ name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into - a single report +description: Aggregate results from bioinformatics analyses across many samples + into a single report keywords: - QC - bioinformatics tools @@ -12,74 +12,91 @@ tools: It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] + licence: + - "GPL-3.0-or-later" identifier: biotools:multiqc input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - ontologies: [] - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections - in multiqc_config. - pattern: "*.{yml,yaml}" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" - ontologies: [] - - replace_names: - type: file - description: | - Optional two-column sample renaming file. 
First column a set of - patterns, second column a set of corresponding replacements. Passed via - MultiQC's `--replace-names` option. - pattern: "*.{tsv}" - ontologies: - - edam: http://edamontology.org/format_3475 # TSV - - sample_names: - type: file - description: | - Optional TSV file with headers, passed to the MultiQC --sample_names - argument. - pattern: "*.{tsv}" - ontologies: - - edam: http://edamontology.org/format_3475 # TSV -output: - report: - - "*multiqc_report.html": + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - multiqc_files: type: file - description: MultiQC report file - pattern: "multiqc_report.html" + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC ontologies: [] - data: - - "*_data": - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - plots: - - "*_plots": + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 + - multiqc_logo: type: file - description: Plots created by MultiQC - pattern: "*_data" + description: Optional logo file for MultiQC + pattern: "*.{png}" ontologies: [] - versions: - - versions.yml: + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 + - sample_names: type: file - description: File containing software versions - pattern: "versions.yml" + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. 
+ pattern: "*.{tsv}" ontologies: - - edam: http://edamontology.org/format_3750 # YAML + - edam: http://edamontology.org/format_3475 +output: + report: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - "*.html": + type: file + description: MultiQC report file + pattern: ".html" + ontologies: [] + data: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - "*_data": + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + plots: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample1', single_end:false ] + - "*_plots": + type: file + description: Plots created by MultiQC + pattern: "*_plots" + ontologies: [] + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - multiqc: + type: string + description: The tool name + - multiqc --version | sed "s/.* //g": + type: eval + description: The expression to obtain the version of the tool authors: - "@abhi18av" - "@bunop" @@ -90,3 +107,27 @@ maintainers: - "@bunop" - "@drpatelh" - "@jfy133" +containers: + conda: + linux/amd64: + lock_file: modules/nf-core/multiqc/.conda-lock/linux_amd64-bd-c1f4a7982b743963_1.txt + linux/arm64: + lock_file: modules/nf-core/multiqc/.conda-lock/linux_arm64-bd-40bf3b435e89dc22_1.txt + docker: + linux/amd64: + name: community.wave.seqera.io/library/multiqc:1.33--c1f4a7982b743963 + build_id: bd-c1f4a7982b743963_1 + scan_id: sc-b7b7f470b2a16699_1 + linux/arm64: + name: community.wave.seqera.io/library/multiqc:1.33--40bf3b435e89dc22 + build_id: bd-40bf3b435e89dc22_1 + scan_id: sc-0e2108a0e7368d2f_1 + singularity: + linux/amd64: + name: oras://community.wave.seqera.io/library/multiqc:1.33--9b3473b1c4bb0493 + build_id: bd-9b3473b1c4bb0493_1 + https: 
https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c4/c4e6d9f669e1a99b53c7dc5cdd6b8e7fd6654032c755bb783cc9849e8203f4d1/data + linux/arm64: + name: oras://community.wave.seqera.io/library/multiqc:1.33--e1ef2065eb21b530 + build_id: bd-e1ef2065eb21b530_1 + https: https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/2a/2acce766e3efb280fa43acdbe85305ea6496ddadbcaa2d806ac4985dfe4686ce/data diff --git a/modules/nf-core/multiqc/tests/custom_prefix.config b/modules/nf-core/multiqc/tests/custom_prefix.config new file mode 100644 index 00000000..b30b1358 --- /dev/null +++ b/modules/nf-core/multiqc/tests/custom_prefix.config @@ -0,0 +1,5 @@ +process { + withName: 'MULTIQC' { + ext.prefix = "custom_prefix" + } +} diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index 33316a7d..4cbdb95d 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -15,25 +15,84 @@ nextflow_process { when { process { """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - input[4] = [] - input[5] = [] + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [], + [], + [], + [] + ]) """ } } then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_single") } - ) + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + 
path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } + } + + test("sarscov2 single-end [fastqc] - custom prefix") { + config "./custom_prefix.config" + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [], + [], + [], + [] + ]) + """ + } } + then { + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } } test("sarscov2 single-end [fastqc] [config]") { @@ -41,23 +100,85 @@ nextflow_process { when { process { """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - input[4] = [] - input[5] = [] + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + 
file("https://raw.githubusercontent.com/nf-core/seqinspector/1.0.0/assets/multiqc_config.yml", checkIfExists: true), + [], + [], + [] + ]) """ } } then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_config") } - ) + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() + } + } + + test("sarscov2 single-end [fastqc] [multiple configs]") { + + when { + process { + """ + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [ + file("https://raw.githubusercontent.com/nf-core/seqinspector/1.0.0/assets/multiqc_config.yml", checkIfExists: true), + file("https://raw.githubusercontent.com/nf-core/seqinspector/1.0.0/assets/multiqc_config.yml", checkIfExists: true) + ], + [], + [], + [] + ]) + """ + } + } + + then { + assert process.success + assert snapshot( + sanitizeOutput(process.out).collectEntries { key, val -> + if (key == "data") { + return [key, val.collect { [path(it[1]).list().collect { file(it.toString()).name }] }] + } + else if (key == "plots") { + return [key, val.collect { [ + "pdf", + path("${it[1]}/pdf").list().collect { file(it.toString()).name }, + "png", + path("${it[1]}/png").list().collect { 
file(it.toString()).name }, + "svg", + path("${it[1]}/svg").list().collect { file(it.toString()).name }] }] + } + else if (key == "report") { + return [key, file(val[0][1].toString()).name] + } + return [key, val] + } + ).match() } } @@ -68,25 +189,23 @@ nextflow_process { when { process { """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - input[4] = [] - input[5] = [] + input[0] = channel.of([ + [ id: 'FASTQC' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true), + [], + [], + [], + [] + ]) """ } } then { + assert process.success assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match("multiqc_stub") } + { assert snapshot(sanitizeOutput(process.out)).match() } ) } - } } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index a88bafd6..3bfc524f 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -1,41 +1,422 @@ { - "multiqc_versions_single": { + "sarscov2 single-end [fastqc] [multiple configs]": { "content": [ - [ - "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" - ] + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + 
"fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + 
"fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "multiqc_report.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } ], + "timestamp": "2026-03-17T16:15:42.577775492", "meta": { - "nf-test": "0.9.3", - "nextflow": "24.10.4" - }, - "timestamp": "2025-10-27T13:33:24.356715" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } }, - "multiqc_stub": { + "sarscov2 single-end [fastqc]": { "content": [ - [ - "multiqc_report.html", - "multiqc_data", - "multiqc_plots", - "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" - ] + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_software_versions.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + 
"fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "multiqc_report.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } ], + "timestamp": "2026-03-17T16:21:17.072841555", "meta": { - "nf-test": "0.9.3", - "nextflow": "24.10.4" - }, - "timestamp": "2025-10-27T13:34:11.103619" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } }, - "multiqc_versions_config": { + "sarscov2 single-end [fastqc] - stub": { "content": [ - [ - "versions.yml:md5,737bb2c7cad54ffc2ec020791dc48b8f" - ] + { + "data": [ + [ + { + "id": "FASTQC" + }, + [ + ".stub:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "plots": [ + [ + { + "id": "FASTQC" + }, + [ + ".stub:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "report": [ + [ + { + "id": "FASTQC" + }, + "multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } ], + "timestamp": 
"2026-02-26T15:14:39.789193051", "meta": { - "nf-test": "0.9.3", - "nextflow": "24.10.4" - }, - "timestamp": "2025-10-27T13:34:04.615233" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 single-end [fastqc] [config]": { + "content": [ + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", + "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + 
"fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "multiqc_report.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-03-17T16:15:30.372239611", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "sarscov2 single-end [fastqc] - custom prefix": { + "content": [ + { + "data": [ + [ + [ + "fastqc-status-check-heatmap.txt", + "fastqc_overrepresented_sequences_plot.txt", + "fastqc_per_base_n_content_plot.txt", + "fastqc_per_base_sequence_quality_plot.txt", + "fastqc_per_sequence_gc_content_plot_Counts.txt", + "fastqc_per_sequence_gc_content_plot_Percentages.txt", + "fastqc_per_sequence_quality_scores_plot.txt", + "fastqc_sequence_counts_plot.txt", + "fastqc_sequence_duplication_levels_plot.txt", + "fastqc_sequence_length_distribution_plot.txt", + "fastqc_top_overrepresented_sequences_table.txt", + "llms-full.txt", + "multiqc.log", + "multiqc.parquet", + "multiqc_citations.txt", + "multiqc_data.json", + "multiqc_fastqc.txt", + "multiqc_general_stats.txt", + "multiqc_software_versions.txt", + "multiqc_sources.txt" + ] + ] + ], + "plots": [ + [ + "pdf", + [ + "fastqc-status-check-heatmap.pdf", + "fastqc_overrepresented_sequences_plot.pdf", + "fastqc_per_base_n_content_plot.pdf", 
+ "fastqc_per_base_sequence_quality_plot.pdf", + "fastqc_per_sequence_gc_content_plot_Counts.pdf", + "fastqc_per_sequence_gc_content_plot_Percentages.pdf", + "fastqc_per_sequence_quality_scores_plot.pdf", + "fastqc_sequence_counts_plot-cnt.pdf", + "fastqc_sequence_counts_plot-pct.pdf", + "fastqc_sequence_duplication_levels_plot.pdf", + "fastqc_sequence_length_distribution_plot.pdf", + "fastqc_top_overrepresented_sequences_table.pdf" + ], + "png", + [ + "fastqc-status-check-heatmap.png", + "fastqc_overrepresented_sequences_plot.png", + "fastqc_per_base_n_content_plot.png", + "fastqc_per_base_sequence_quality_plot.png", + "fastqc_per_sequence_gc_content_plot_Counts.png", + "fastqc_per_sequence_gc_content_plot_Percentages.png", + "fastqc_per_sequence_quality_scores_plot.png", + "fastqc_sequence_counts_plot-cnt.png", + "fastqc_sequence_counts_plot-pct.png", + "fastqc_sequence_duplication_levels_plot.png", + "fastqc_sequence_length_distribution_plot.png", + "fastqc_top_overrepresented_sequences_table.png" + ], + "svg", + [ + "fastqc-status-check-heatmap.svg", + "fastqc_overrepresented_sequences_plot.svg", + "fastqc_per_base_n_content_plot.svg", + "fastqc_per_base_sequence_quality_plot.svg", + "fastqc_per_sequence_gc_content_plot_Counts.svg", + "fastqc_per_sequence_gc_content_plot_Percentages.svg", + "fastqc_per_sequence_quality_scores_plot.svg", + "fastqc_sequence_counts_plot-cnt.svg", + "fastqc_sequence_counts_plot-pct.svg", + "fastqc_sequence_duplication_levels_plot.svg", + "fastqc_sequence_length_distribution_plot.svg", + "fastqc_top_overrepresented_sequences_table.svg" + ] + ] + ], + "report": "custom_prefix.html", + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } + ], + "timestamp": "2026-03-17T16:15:18.189023981", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/nextflow.config b/modules/nf-core/multiqc/tests/nextflow.config index c537a6a3..374dfef2 100644 --- 
a/modules/nf-core/multiqc/tests/nextflow.config +++ b/modules/nf-core/multiqc/tests/nextflow.config @@ -1,5 +1,6 @@ process { withName: 'MULTIQC' { ext.prefix = null + ext.args = '-p' } } diff --git a/nextflow.config b/nextflow.config index fd2b2f62..910a3ec7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,16 +9,65 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null + // Mandatory inputs + species = null + + // general options + keywords = "" + target_genes = "" + target_gene_file = null + platform = null + accessions_only = false + download_only = false + + // Local datasets + datasets = null + + // Expression atlas + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + accessions = "" + excluded_accessions = "" + accessions_file = null + excluded_accessions_file = null + + // ID mapping + gprofiler_target_db = "ENSG" + gene_metadata = null + gene_id_mapping = null + skip_id_mapping = false + skip_cleaning_gene_ids = false + min_occurrence_freq = 0.1 + min_occurrence_quantile = 0.2 + + // sample filtering + max_zero_ratio = 0.9 + max_null_ratio = 0.9 + max_null_ratio_valid_sample = 0.75 + + // statistics + normalisation_method = 'tpm' + gene_length = null + gff = null + quantile_norm_target_distrib = 'uniform' + nb_sections = 20 + nb_candidates_per_section = 250 + missing_value_imputer = 'iterative' + + // stability scoring + skip_genorm = false + stability_score_weights = "0.5,0.5,0,0" + + // random sampling + random_sampling_seed = 42 + random_sampling_size = 5000 // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options outdir = null @@ -78,6 +127,17 @@ profiles { 
charliecloud.enabled = false apptainer.enabled = false } + micromamba { + conda.enabled = true + conda.useMicromamba = true + conda.channels = ['conda-forge', 'bioconda'] + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } docker { docker.enabled = true conda.enabled = false @@ -161,8 +221,10 @@ profiles { apptainer.runOptions = '--nv' singularity.runOptions = '--nv' } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_dataset_eatlas { includeConfig 'conf/test_dataset_eatlas.config' } } // Load nf-core custom profiles from different institutions @@ -171,13 +233,12 @@ profiles { // Load nf-core/stableexpression custom profiles from different institutions. includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" - // Load nf-core/stableexpression custom profiles from different institutions. // TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs -// includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/pipeline/stableexpression.config" : "/dev/null" +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/pipeline/stableexpression.config" : "/dev/null" -// Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile -// Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled // Set to your registry if you have a mirror of containers apptainer.registry = 'quay.io' docker.registry = 'quay.io' @@ -234,11 +295,11 @@ manifest { // TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0 [ name: 'Olivier Coen', - affiliation: '', - email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + affiliation: 'CNRS / Université Paris-Saclay', + email: 'olivier.coen@universite-paris-saclay.fr', + github: 'OlivierCoen', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0003-3387-1040' ], ] homePage = 'https://github.com/nf-core/stableexpression' @@ -246,7 +307,7 @@ manifest { mainScript = 'main.nf' defaultBranch = 'main' nextflowVersion = '!>=25.04.0' - version = '1.0dev' + version = '1.0.0' doi = '' } diff --git a/nextflow_schema.json b/nextflow_schema.json index bc12467a..8109b426 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/stableexpression/main/nextflow_schema.json", "title": "nf-core/stableexpression pipeline parameters", - "description": "This pipeline is dedicated to finding the most stable genes across count datasets", + "description": "This pipeline is dedicated to identifying the most stable genes within a single or multiple expression 
dataset(s). This is particularly useful for identifying the most suitable RT-qPCR reference genes for a specific species.", "type": "object", "$defs": { "input_output_options": { @@ -10,25 +10,72 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["species", "outdir"], "properties": { - "input": { + "species": { "type": "string", - "format": "file-path", - "exists": true, - "schema": "assets/schema_input.json", - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/stableexpression/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "description": "Scientific species name (genus and species)", + "fa_icon": "fas fa-hippo", + "pattern": "^([a-zA-Z]+)[_ ]([a-zA-Z]+)[_ a-zA-Z]*$", + "help_text": "At least genus and species name should be supplied. Words should be separated by ` ` or `_`. Note that character case is ignored. Examples: `--species 'Arabidopsis thaliana'`, `--species 'homo_sapiens'` or `--species MARMOTA_MARMOTA_MARMOTA`." }, "outdir": { "type": "string", "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "description": "Output directory", + "help_text": "The output directory where the results will be saved.
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" }, + "datasets": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_datasets.json", + "pattern": "^\\S+\\.(csv|yaml|yml|dat)$", + "description": "Custom datasets (counts + designs)", + "help_text": "Path to CSV / YAML file listing your own count datasets and their related experimental design. This file should be a comma-separated file with 4 columns (`counts`, `design`, `platform` and `normalised`). It must have a header row. Before running the pipeline, and for each count dataset provided by you, a design file with information about the samples in your experiment is required. Combine with --skip_fetch_eatlas_accessions if you only want to analyse your own count datasets. Otherwise, accessions from Expression Atlas and GEO will be fetched automatically. See [usage docs](https://nf-co.re/stableexpression/usage#samplesheet-input) for more information. ", + "fa_icon": "fas fa-file-csv" + }, + "keywords": { + "type": "string", + "description": "Keywords used for selecting specific Expression Atlas / GEO accessions", + "fa_icon": "fas fa-font", + "pattern": "(([a-zA-Z,]+))?", + "help_text": "Keywords (separated by commas) to use when retrieving specific experiments from Expression Atlas and / or GEO datasets. The pipeline will select all Expression Atlas experiments / GEO datasets that contain the provided keywords in their description or in one of the condition names. Example: `--keywords 'stress,flowering'`. This parameter is unused if --skip_fetch_eatlas_accessions is set and --fetch_geo_accessions is not set." + }, + "target_genes": { + "type": "string", + "description": "Target genes", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "One or multiple target genes (separated by commas). These can be gene IDs (as provided in your input datasets), Ensembl gene IDs, or gene symbols."
+ }, + "target_gene_file": { + "type": "string", + "description": "File containing target genes", + "format": "file-path", + "exists": true, + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "File containing one or multiple target genes (one ID per line). These can be gene IDs (as provided in your input datasets), Ensembl gene IDs, or gene symbols." + }, + "platform": { + "type": "string", + "enum": ["rnaseq", "microarray"], + "description": "Only download from this platform", + "fa_icon": "fas fa-arrows-alt-h", + "help_text": "By default, data from both RNA-seq and Microarray platforms are downloaded. Setting this parameter applies a filter to get data from only one of the two platforms. This filter is only used while fetching appropriate Expression atlas / GEO accessions. It will not filter accessions provided directly by the user." + }, + "accessions_only": { + "type": "boolean", + "description": "Only get accessions from Expression Atlas / GEO and exit.", + "fa_icon": "far fa-stop-circle", + "help_text": "Use this option if you only want to get Expression Atlas accessions and skip the rest of the pipeline." + }, + "download_only": { + "type": "boolean", + "description": "Only get accessions from Expression Atlas / GEO and download the selected datasets.", + "fa_icon": "far fa-stop-circle", + "help_text": "Use this option if you only want to get Expression Atlas / GEO accessions, download the selected data, and skip the rest of the pipeline." 
+ }, "email": { "type": "string", "description": "Email address for completion summary.", @@ -43,6 +90,269 @@ } } }, + "public_data_options": { + "title": "Public data options", + "type": "object", + "fa_icon": "fas fa-book-atlas", + "description": "Options for fetching experiment data from Expression Atlas / GEO.", + "properties": { + "skip_fetch_eatlas_accessions": { + "type": "boolean", + "fa_icon": "fas fa-cloud-arrow-down", + "description": "Skip fetching Expression Atlas accessions", + "help_text": "Expression Atlas accessions are automatically fetched by default. Set this parameter to skip this step." + }, + "fetch_geo_accessions": { + "type": "boolean", + "fa_icon": "fas fa-cloud-arrow-down", + "description": "Fetch GEO accessions from NCBI [Experimental]", + "help_text": "Set this parameter to fetch GEO accessions from NCBI. **This feature is experimental and may not work as expected**. Please report any issues to https://github.com/nf-core/stableexpression/issues." + }, + "accessions": { + "type": "string", + "pattern": "([A-Z0-9-]+,?)+", + "description": "Expression Atlas / GEO accession(s) to include", + "fa_icon": "fas fa-address-card", + "help_text": "Provide Expression Atlas / GEO accession(s) that you want to download. The accessions should be comma-separated. Example: `--accessions E-MTAB-552,E-GEOD-61690,GSE8165,GSE8161`. Combine with --skip_fetch_eatlas_accessions if you want only these accessions to be used. User provided accessions are prioritised over excluded accessions." + }, + "accessions_file": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "File containing Expression Atlas / GEO accession(s) to download", + "fa_icon": "fas fa-file", + "help_text": "File containing Expression Atlas / GEO accession(s) that you want to download. One accession per line. Example: `--accessions_file included_accessions.txt`. Combine with --skip_fetch_eatlas_accessions if you want only these accessions to be used.
User provided accessions are prioritised over excluded accessions." + }, + "excluded_accessions": { + "type": "string", + "pattern": "([A-Z0-9-]+,?)+", + "description": "Expression Atlas accession(s) to exclude", + "fa_icon": "fas fa-id-card", + "help_text": "Provide Expression Atlas / GEO accession(s) that you want to exclude. The accessions should be comma-separated. Example: `--excluded_accessions E-MTAB-552,E-GEOD-61690`" + }, + "excluded_accessions_file": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "File containing Expression Atlas accession(s) to exclude", + "fa_icon": "fas fa-file", + "help_text": "File containing Expression Atlas / GEO accession(s) that you want to exclude. One accession per line. Example: `--excluded_accessions_file excluded_accessions.txt`." + } + } + }, + "idmapping_options": { + "title": "ID mapping options", + "type": "object", + "fa_icon": "fas fa-map", + "description": "Options for mapping gene IDs.", + "properties": { + "skip_id_mapping": { + "type": "boolean", + "description": "Skip g:Profiler ID mapping step", + "fa_icon": "fas fa-ban", + "help": "If you don't want to map gene IDs with g:Profiler, you can skip this step by providing `--skip_id_mapping`. It can be in particular useful if the g:Profiler is down and if you already have a custom mapping file." + }, + "skip_cleaning_gene_ids": { + "type": "boolean", + "description": "Skip cleaning gene IDs step", + "fa_icon": "fas fa-ban", + "help": "If you don't want to clean gene IDs, you can skip this step by providing `--skip_cleaning_gene_ids`. Note that gene ID cleaning is automatically disabled with `--skip_id_mapping`." + }, + "gprofiler_target_db": { + "type": "string", + "description": "Experimental: target database for g:Profiler", + "fa_icon": "fas fa-divide", + "enum": ["ENSG", "ENTREZGENE", "UNIPROTSPTREMBL", "UNIPROTSWISSPROT"], + "default": "ENSG", + "help_text": "Experimental: target database for g:Profiler. 
You can see the full list of available target databases at https://biit.cs.ut.ee/gprofiler/convert." + }, + "gene_id_mapping": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_id_mapping.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|dat)$", + "description": "Custom gene id mapping file", + "help_text": "Path to comma-separated file containing custom gene id mappings. Each row represents a mapping from the original gene ID in your count datasets to a preferred gene ID. The mapping file should be a comma-separated file with 2 columns (original_gene_id and gene_id) and a header row.", + "fa_icon": "fas fa-file" + }, + "gene_metadata": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_metadata.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|dat)$", + "description": "Custom gene metadata file", + "help_text": "Path to comma-separated file containing custom gene metadata information. Each row represents a gene and links its gene ID to its name and description. The metadata file should be a comma-separated file with 3 columns (gene_id, name and description) and a header row.", + "fa_icon": "fas fa-file" + }, + "min_occurrence_quantile": { + "type": "number", + "description": "Minimum quantile for the frequency of occurrence", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.2, + "help_text": "To avoid genes that are rarely observed, genes less represented than the specified quantile will be filtered out. For example, a value of 0.2 means that the 20% less represented will be filtered out. This filter is applied before using the absolute filter `--min_occurrence_freq`." 
+ }, + "min_occurrence_freq": { + "type": "number", + "description": "Minimum frequency of occurrence among all datasets", + "fa_icon": "fas fa-battery-three-quarters", + "minimum": 0, + "maximum": 1, + "default": 0.1, + "help_text": "To avoid genes that are rarely observed, genes showing a frequency of occurrence below this threshold will be filtered out." + } + } + }, + "sample_filtering_options": { + "title": "Sample filtering options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Options for filtering samples based on their expression levels.", + "properties": { + "max_zero_ratio": { + "type": "number", + "description": "Maximum ratio of zero counts to total counts", + "fa_icon": "fas fa-divide", + "minimum": 0, + "maximum": 1, + "default": 0.9, + "help_text": "A filter is set up to avoid samples that contain a high proportion of zero counts. All samples with a ratio of zero counts to total counts above this threshold will be filtered out." + }, + "max_null_ratio": { + "type": "number", + "description": "Maximum ratio of null values", + "fa_icon": "fas fa-divide", + "minimum": 0, + "maximum": 1, + "default": 0.9, + "help_text": "A filter is set up to avoid samples that contain a high proportion of zero counts. All samples with a ratio of zero counts to total counts above this threshold will be filtered out." + }, + "max_null_ratio_valid_sample": { + "type": "number", + "description": "Maximum ratio of null values in a sample for it to be considered in the computation of the null value malus", + "fa_icon": "fas fa-divide", + "minimum": 0, + "maximum": 1, + "default": 0.75, + "help_text": "After filtering out samples with a very high proportion of zero counts (via `--max_null_ratio`), a second filter is set up to avoid samples that contain a substantial proportion of null values to be considered in the malus of null values comprised in the stability score." 
+ } + } + }, + "statistical_options": { + "title": "Statistics options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Statistical options for normalisation and calculation of gene expression variation.", + "properties": { + "normalisation_method": { + "type": "string", + "description": "Count normalisation method", + "fa_icon": "fas fa-divide", + "enum": ["tpm", "cpm"], + "default": "tpm", + "help_text": "Raw RNAseq data must be normalised before further processing. `tmp offers a more accurate representation of gene expression levels as it is unbiased toward gene length. However, you can choose `cpm` if you do not have access to a genome annotation." + }, + "gff": { + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/tsv", + "pattern": "^\\S+\\.(gff|dat)$", + "description": "Genome annotation file (GFF format)", + "help_text": "Path to genome annotation file (GFF format). Cannot be compressed.", + "fa_icon": "fas fa-file" + }, + "gene_length": { + "type": "string", + "format": "file-path", + "exists": true, + "schema": "assets/schema_gene_length.json", + "mimetype": "text/csv", + "pattern": "^\\S+\\.(csv|tsv|dat)$", + "description": "Gene length file", + "help_text": "Path to comma-separated file containing gene lengths. Each row represents a gene and gives the length of its longest transcript. The file should be a comma-separated file with 2 columns (gene_id and length) and a header row.", + "fa_icon": "fas fa-file" + }, + "quantile_norm_target_distrib": { + "type": "string", + "description": "Target distribution for quantile normalisation", + "fa_icon": "fas fa-chart-bar", + "enum": ["uniform", "normal"], + "default": "uniform", + "help_text": "In order to compare counts between samples and different datasets, all normalised counts are quantile normalised and mapped to a specific distribution. The pipeline uses scikit-learn's quantile_transform function. 
You can select the target distribution to map counts to." + }, + "missing_value_imputer": { + "type": "string", + "description": "Type of imputation method to use for missing values", + "fa_icon": "fas fa-battery-three-quarters", + "enum": ["iterative", "knn", "gene_mean"], + "default": "iterative", + "help_text": "The pipeline provides three options for imputing missing values: iterative, k-nearest neighbors, and gene mean. Iterative imputation uses a bayesian iterative algorithm to fill in missing values. K-nearest neighbors imputation uses a k-nearest neighbors algorithm to fill in missing values. Gene mean imputation is a very basic method that replaces missing values with the mean expression level of the gene across all samples." + } + } + }, + "stability_scoring_options": { + "title": "Stability scoring options", + "type": "object", + "fa_icon": "fas fa-chart-line", + "description": "Options relative to assessment of stability for each gene.", + "properties": { + "nb_sections": { + "type": "integer", + "description": "Number of sections to divide genes into for stability scoring.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "All genes are divided into sections based on their expression levels. Set this parameter to modify the number of sections." + }, + "nb_candidates_per_section": { + "type": "integer", + "description": "Number of candidate genes to keep for stability scoring in each section", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "Number of candidate genes to keep in each section for stability scoring. Within each section, the top candidates are selected based on the descriptor chosen with `--candidate_selection_descriptor`." + }, + "skip_genorm": { + "type": "boolean", + "description": "Skip Genorm", + "fa_icon": "fas fa-check", + "help": "Skip Genorm by setting this parameter to true. In this case, by default, only Normfinder will participate in the stability score." 
+ }, + "stability_score_weights": { + "type": "string", + "description": "Weights for stability score calculation", + "fa_icon": "fas fa-balance-scale", + "help_text": "Weights for Normfinder / Genorm / Coefficient of Variation (CV) / Robust Coefficient of Variation on Median (RCVM) respectively. Must be a comma-separated string. Example: 0.5,0.5,0.0,0", + "pattern": "^\\d+(\\.\\d+)?,\\d+(\\.\\d+)?,\\d+(\\.\\d+)?,\\d+(\\.\\d+)?$" + } + } + }, + "scalability_options": { + "title": "Scalability options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Options to improve pipeline scalability and robustness", + "properties": { + "random_sampling_size": { + "type": "integer", + "description": "Number of public dataset samples to choose randomly before downloading.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 1, + "help_text": "When dealing with species for which there is a large number (eg. >10000) of samples considering all the downloaded datasets, users may encounter RAM issues (eg. errors with `137` exit codes). In such cases, it is recommended to sample a random subset of these datasets to reduce the computational load. A first subsampling is performedduring the search for Expression Atlas accessions. In case there is still room for datasets and if the `--fetch_geo_accessions` flag was set, a second ssubsampling is performed during the search for NCBI GEO accessions." + }, + "random_sampling_seed": { + "type": "integer", + "description": "Seed for dataset random sampling.", + "fa_icon": "fas fa-sort-numeric-up-alt", + "minimum": 0, + "help_text": "Seed for dataset random sampling. This ensures reproducibility of the random sampling process. Changing the seed will result in a different random sample being selected." 
+ } + } + }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -205,6 +515,24 @@ { "$ref": "#/$defs/input_output_options" }, + { + "$ref": "#/$defs/public_data_options" + }, + { + "$ref": "#/$defs/idmapping_options" + }, + { + "$ref": "#/$defs/sample_filtering_options" + }, + { + "$ref": "#/$defs/statistical_options" + }, + { + "$ref": "#/$defs/stability_scoring_options" + }, + { + "$ref": "#/$defs/scalability_options" + }, { "$ref": "#/$defs/institutional_config_options" }, diff --git a/nf-test.config b/nf-test.config index 3a1fff59..a0a009fd 100644 --- a/nf-test.config +++ b/nf-test.config @@ -6,19 +6,26 @@ config { workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" // location of an optional nextflow.config file specific for executing tests + testsDir "tests" + workDir ".nf-test" configFile "tests/nextflow.config" // ignore tests coming from the nf-core/modules repo ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' // run all test with defined profile(s) from the main nextflow.config - profile "test" + //profile "apptainer" // list of filenames or patterns that should be trigger a full test run triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' // load the necessary plugins + requires ( + "nf-test": "0.9.3" + ) plugins { load "nft-utils@0.0.3" + load "nft-csv@0.1.0" } + } diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..5fbcbac4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.ruff.lint] +# Avoid enforcing line-length violations (`E501`) +ignore = ["E501"] + +[tool.ruff.format] +# Use single quotes when formatting. 
+quote-style = "double" +indent-style = "space" + +[tool.basedpyright] +reportUnusedCallResult = "none" +reportUnknownMemberType = "none" +reportUnknownVariableType = "none" +reportUnknownParameterType = "none" +reportUnknownArgumentType = "none" +reportAny = "none" +reportImplicitRelativeImport = "none" diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 0c98feb8..e99a20e2 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -21,9 +21,9 @@ { "@id": "./", "@type": "Dataset", - "creativeWorkStatus": "InProgress", - "datePublished": "2025-11-20T09:32:21+00:00", - "description": "

\n \n \n \"nf-core/stableexpression\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/stableexpression)\n[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression)\n\n[![Get 
help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/stableexpression** is a bioinformatics pipeline that ...\n\n\n\n\n2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\n\nNow, you can run the pipeline using:\n\n\n\n```bash\nnextflow run nf-core/stableexpression \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/stableexpression/usage) and the [parameter documentation](https://nf-co.re/stableexpression/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/stableexpression/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/stableexpression/output).\n\n## Credits\n\nnf-core/stableexpression was originally written by Olivier Coen.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#stableexpression` channel](https://nfcore.slack.com/channels/stableexpression) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "creativeWorkStatus": "Stable", + "datePublished": "2026-03-14T09:55:43+00:00", + "description": "

\n \n \n \"nf-core/stableexpression\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/stableexpression)\n[![GitHub Actions CI Status](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/stableexpression/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/stableexpression/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with apptainer](https://custom-icon-badges.demolab.com/badge/run%20with-apptainer-4545?logo=apptainer&color=teal&labelColor=000000)](https://apptainer.org/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera 
Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/stableexpression)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23stableexpression-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/stableexpression)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/stableexpression** is a bioinformatics pipeline aiming to aggregate multiple count datasets for a specific species and find the most stable genes. The datasets can be either downloaded from public databases (EBI, NCBI) or provided directly by the user. Both RNA-seq and Microarray count datasets can be utilised.\n\n

\n \n

\n\nIt takes as main inputs :\n\n- a species name (mandatory)\n- keywords for Expression Atlas / GEO search (optional)\n- a CSV input file listing your own raw / normalised count datasets (optional).\n\n**Use cases**:\n\n- **find the most suitable genes as RT-qPCR reference genes for a specific species (and optionally specific conditions)**\n- download all Expression Atlas and / or NCBI GEO datasets for a species (and optionally keywords)\n\n## Pipeline overview\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:\n\n#### 1. Get accessions from public databases\n\n- Get [Expression Atlas](https://www.ebi.ac.uk/gxa/home) dataset accessions corresponding to the provided species (and optionally keywords)\n This step is run by default but is optional. Set `--skip_fetch_eatlas_accessions` to skip it.\n- Get NBCI [GEO](https://www.ncbi.nlm.nih.gov/gds) **microarray** dataset accessions corresponding to the provided species (and optionally keywords)\n This is optional and **NOT** run by default. Set `--fetch_geo_accessions` to run it.\n\n#### 2. Download data (see [usage](./conf/usage.md#3-provide-your-own-accessions))\n\n- Download [Expression Atlas](https://www.ebi.ac.uk/gxa/home) data if any\n- Download NBCI [GEO](https://www.ncbi.nlm.nih.gov/gds) data if any\n\n> [!NOTE]\n> At this point, datasets downloaded from public databases are merged with datasets provided by the user using the `--datasets` parameter. See [usage](./conf/usage.md#4-use-your-own-expression-datasets) for more information about local datasets.\n\n#### 3. ID Mapping (see [usage](./conf/usage.md#5-custom-gene-id-mapping--metadata))\n\n- Gene IDs are cleaned\n- Map gene IDS to NCBI Entrez Gene IDS (or Ensembl IDs) for standardisation among datasets using [g:Profiler](https://biit.cs.ut.ee/gprofiler/gost) (run by default; optional)\n- Rare genes are filtered out\n\n#### 4. 
Sample filtering\n\nSamples that show too high ratios of zeros or missing values are removed from the analysis.\n\n#### 5. Normalisation of expression\n\n- Normalize RNAseq raw data using TPM (necessitates downloading the corresponding genome and computing transcript lengths) or CPM.\n- Perform quantile normalisation on each dataset separately using [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html)\n\n#### 6. Merge all data\n\nAll datasets are merged into one single dataframe.\n\n#### 7. Imputation of missing values\n\nMissing values are replaced by imputed values using a specific algorithm provided by [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.quantile_transform.html). The user can choose the method of imputation with the `--missing_value_imputer` parameter.\n\n#### 8. General statistics for each gene\n\nBase statistics are computed for each gene, platform-wide and for each platform (RNAseq and microarray).\n\n#### 9. Scoring\n\n- The whole list of genes is divided in multiple sections, based on their expression level.\n- Based on the coefficient of variation, a shortlist of candidates genes is extracted for each section.\n- Run optimised, scalable version of [Normfinder](https://www.moma.dk/software/normfinder)\n- Run optimised, scalable version of [Genorm](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2002-3-7-research0034) (run by default; optional)\n- Compute stability scores for each candidate gene\n\n#### 10. 
Reporting\n\n- Result aggregation\n- Make [`MultiQC`](http://multiqc.info/) report\n- Prepare [Dash Plotly](https://dash.plotly.com/) app for further investigation of gene / sample counts\n\n## Test pipeline\n\nYou can test the execution of the pipeline locally with:\n\n```bash\nnextflow run nf-core/stableexpression -profile test,\n```\n\n## Basic usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nTo search the most stable genes in a species considering all public datasets, simply run:\n\n```bash\nnextflow run nf-core/stableexpression \\\n -profile \\\n --species \\\n --outdir \\\n -resume\n```\n\n## More advanced usage\n\nFor more specific scenarios, like:\n\n- **fetching only specific conditions**\n- **using your own expression dataset(s)**\n\nplease refer to the [usage documentation](https://nf-co.re/stableexpression/usage).\n\n## Resource allocation\n\nFor setting pipeline CPU / memory usage, see [here](./docs/configuration.md).\n\n## Profiles\n\nSee [here](https://nf-co.re/stableexpression/usage#profiles) for more information about profiles.\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/stableexpression/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/stableexpression/output).\n\n## Support us\n\nIf you like nf-core/stableexpression, please make sure you give it a star on GitHub!\n\n[![stars - stableexpression](https://img.shields.io/github/stars/nf-core/stableexpression?style=social)](https://github.com/nf-core/stableexpression)\n\n## Credits\n\nnf-core/stableexpression was originally written 
by Olivier Coen.\n\nWe thank the following people for their assistance in the development of this pipeline:\n\n- R\u00e9my Costa\n- Shaheen Acheche\n- Janine Soares\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#stableexpression` channel](https://nfcore.slack.com/channels/stableexpression) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. 
doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -31,6 +31,9 @@ { "@id": "assets/" }, + { + "@id": "bin/" + }, { "@id": "conf/" }, @@ -43,6 +46,9 @@ { "@id": "modules/" }, + { + "@id": "modules/local/" + }, { "@id": "modules/nf-core/" }, @@ -99,7 +105,7 @@ }, "mentions": [ { - "@id": "#ea87a9e0-dad4-4149-b745-000686183a2c" + "@id": "#6aa6a373-9bb0-4502-a8f4-2fce1f6296ee" } ], "name": "nf-core/stableexpression" @@ -121,31 +127,50 @@ }, { "@id": "main.nf", - "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], + "@type": [ + "File", + "SoftwareSourceCode", + "ComputationalWorkflow" + ], "creator": [ { "@id": "https://orcid.org/0000-0003-3387-1040" } ], "dateCreated": "", - "dateModified": "2025-11-20T09:32:21Z", + "dateModified": "2026-03-14T10:55:43Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", - "keywords": ["nf-core", "nextflow", "expression", "housekeeping-genes", "qpcr-analysis"], - "license": ["MIT"], + "keywords": [ + "nf-core", + "nextflow", + "expression", + "housekeeping-genes", + "qpcr-analysis" + ], + "license": [ + "MIT" + ], "maintainer": [ { "@id": "https://orcid.org/0000-0003-3387-1040" } ], - "name": ["nf-core/stableexpression"], + "name": [ + "nf-core/stableexpression" + ], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" }, "sdPublisher": { "@id": "https://nf-co.re/" }, - "url": ["https://github.com/nf-core/stableexpression", "https://nf-co.re/stableexpression/dev/"], - "version": ["1.0dev"] + "url": [ + "https://github.com/nf-core/stableexpression", + "https://nf-co.re/stableexpression/1.0.0/" + ], + "version": [ + "1.0.0" + ] }, { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow", @@ -160,11 +185,11 @@ "version": "!>=25.04.0" }, { - "@id": "#ea87a9e0-dad4-4149-b745-000686183a2c", + "@id": "#6aa6a373-9bb0-4502-a8f4-2fce1f6296ee", "@type": "TestSuite", "instance": [ { 
- "@id": "#7460c1e2-3fe8-4d1a-bffb-31ea3d133ade" + "@id": "#2fa3572f-894b-4153-a0a8-dca4386f0cea" } ], "mainEntity": { @@ -173,7 +198,7 @@ "name": "Test suite for nf-core/stableexpression" }, { - "@id": "#7460c1e2-3fe8-4d1a-bffb-31ea3d133ade", + "@id": "#2fa3572f-894b-4153-a0a8-dca4386f0cea", "@type": "TestInstance", "name": "GitHub Actions workflow for testing nf-core/stableexpression", "resource": "repos/nf-core/stableexpression/actions/workflows/nf-test.yml", @@ -195,6 +220,11 @@ "@type": "Dataset", "description": "Additional files" }, + { + "@id": "bin/", + "@type": "Dataset", + "description": "Scripts that must be callable from a pipeline process" + }, { "@id": "conf/", "@type": "Dataset", @@ -215,6 +245,11 @@ "@type": "Dataset", "description": "Modules used by the pipeline" }, + { + "@id": "modules/local/", + "@type": "Dataset", + "description": "Pipeline-specific modules" + }, { "@id": "modules/nf-core/", "@type": "Dataset", @@ -308,4 +343,4 @@ "name": "Olivier Coen" } ] -} +} \ No newline at end of file diff --git a/subworkflows/local/dataset_analysis/main.nf b/subworkflows/local/dataset_analysis/main.nf new file mode 100644 index 00000000..66ec06ed --- /dev/null +++ b/subworkflows/local/dataset_analysis/main.nf @@ -0,0 +1,23 @@ +include { COMPUTE_DATASET_STATISTICS as DESCRIPTIVE_STATISTICS } from '../../../modules/local/compute_dataset_statistics' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE VARIOUS STATISTICS AT THE DATASET / SAMPLE LEVEL +======================================================================================== +*/ + +workflow DATASET_ANALYSIS { + + take: + ch_counts + + main: + + // ----------------------------------------------------------------- + // COMPUTE VARIOUS STATISTICS AT THE SAMPLE LEVEL + // ----------------------------------------------------------------- + + DESCRIPTIVE_STATISTICS ( ch_counts ) + + +} diff --git 
include { EXPRESSIONATLAS_GETDATA as EXPRESSION_ATLAS } from '../../../modules/local/expressionatlas/getdata'
include { GEO_GETDATA as GEO } from '../../../modules/local/geo/getdata'

include { addDatasetIdToMetadata } from '../utils_nfcore_stableexpression_pipeline'
include { groupFilesByDatasetId } from '../utils_nfcore_stableexpression_pipeline'
include { augmentMetadata } from '../utils_nfcore_stableexpression_pipeline'

/*
========================================================================================
    SUBWORKFLOW TO DOWNLOAD EXPRESSION ATLAS AND NCBI GEO DATASETS
========================================================================================
*/

workflow DOWNLOAD_PUBLIC_DATASETS {

    take:
    species       // value: species name passed to the GEO downloader
    ch_accessions // channel of accession strings ('E-…' for Expression Atlas, 'GSE…' for GEO)

    main:

    // Route each accession to the matching downloader.
    // NOTE: accessions matching neither prefix are silently dropped here;
    // upstream GET_PUBLIC_ACCESSIONS already filters to E-/GSE, so none are expected.
    ch_accessions = ch_accessions
        .branch { acc ->
            eatlas: acc.startsWith('E-')
            geo: acc.startsWith('GSE')
        }

    // ------------------------------------------------------------------------------------
    // DOWNLOAD EXPRESSION ATLAS DATASETS
    // ------------------------------------------------------------------------------------

    // Downloading Expression Atlas data for each accession in ch_accessions
    EXPRESSION_ATLAS( ch_accessions.eatlas )

    // ------------------------------------------------------------------------------------
    // DOWNLOAD GEO DATASETS
    // ------------------------------------------------------------------------------------

    // Downloading GEO datasets for each accession in ch_accessions
    GEO(
        ch_accessions.geo,
        species
    )

    ch_downloaded_counts = EXPRESSION_ATLAS.out.counts.mix ( GEO.out.counts )
    ch_downloaded_design = EXPRESSION_ATLAS.out.design.mix ( GEO.out.design )

    // adding dataset id (accession + data_type) in the file meta
    // flattening in case multiple files are returned at once
    ch_counts = addDatasetIdToMetadata( ch_downloaded_counts.flatten() )
    ch_design = addDatasetIdToMetadata( ch_downloaded_design.flatten() )

    // adding design files to the meta of their respective count files
    ch_datasets = groupFilesByDatasetId( ch_design, ch_counts )

    // adding normalisation state in the meta
    ch_datasets = augmentMetadata( ch_datasets )

    emit:
    datasets = ch_datasets // tuples of ( meta-with-design-and-normalisation-state, counts file )

}
include { NORMALISATION_COMPUTE_CPM as COMPUTE_CPM } from '../../../modules/local/normalisation/compute_cpm'
include { NORMALISATION_COMPUTE_TPM as COMPUTE_TPM } from '../../../modules/local/normalisation/compute_tpm'
include { QUANTILE_NORMALISATION } from '../../../modules/local/quantile_normalisation'

include { GET_TRANSCRIPT_LENGTHS } from '../../../subworkflows/local/get_transcript_lengths'

/*
========================================================================================
    SUBWORKFLOW TO NORMALISE AND HARMONISE EXPRESSION DATASETS
========================================================================================
*/

workflow EXPRESSION_NORMALISATION {

    take:
    species                      // value: species name (used only when annotation must be downloaded)
    ch_datasets                  // channel of ( meta, counts ); meta.normalised marks already-normalised data
    normalisation_method         // value: 'tpm' or 'cpm' (anything other than 'tpm' falls through to CPM)
    quantile_norm_target_distrib // value: target distribution for quantile normalisation
    gff_file                     // value: optional GFF path, forwarded to GET_TRANSCRIPT_LENGTHS
    gene_length_file             // value: optional precomputed gene-length CSV (takes precedence over gff_file)

    main:

    //
    // MODULE: normalisation of raw count datasets (including downloaded RNA-seq datasets)
    // at the same time, removing genes that show only zero counts
    //

    ch_datasets = ch_datasets.branch {
        meta, file ->
            raw: meta.normalised == false
            normalised: meta.normalised == true
    }

    // NOTE(review): only raw RNA-seq datasets pass this filter; a raw dataset with
    // any other platform (e.g. raw microarray) is dropped here and never reaches
    // quantile normalisation — confirm this is intended.
    ch_raw_rnaseq_datasets_to_normalise = ch_datasets.raw.filter { meta, file -> meta.platform == 'rnaseq' }

    if ( normalisation_method == 'tpm' ) {

        if ( gene_length_file ) {

            // user-supplied gene lengths short-circuit the annotation download
            ch_gene_length_file = channel.fromPath( gene_length_file, checkIfExists: true )

        } else {

            // download genome annotation
            // and computing length of the longest transcript gene per gene
            GET_TRANSCRIPT_LENGTHS(
                species,
                gff_file
            )
            ch_gene_length_file = GET_TRANSCRIPT_LENGTHS.out.csv

        }

        COMPUTE_TPM(
            ch_raw_rnaseq_datasets_to_normalise,
            ch_gene_length_file
        )
        ch_raw_rnaseq_datasets_normalised = COMPUTE_TPM.out.counts

    } else { // 'cpm'

        COMPUTE_CPM( ch_raw_rnaseq_datasets_to_normalise )
        ch_raw_rnaseq_datasets_normalised = COMPUTE_CPM.out.counts

    }

    //
    // MODULE: Quantile normalisation
    //

    // putting all normalised count datasets together and performing quantile normalisation
    QUANTILE_NORMALISATION (
        ch_datasets.normalised.mix( ch_raw_rnaseq_datasets_normalised ),
        quantile_norm_target_distrib
    )


    emit:
    counts = QUANTILE_NORMALISATION.out.counts // ( meta, quantile-normalised counts )

}
include { COMPUTE_GENE_STATISTICS as GLOBAL } from '../../../modules/local/compute_gene_statistics'
include { COMPUTE_GENE_STATISTICS as PLATFORM } from '../../../modules/local/compute_gene_statistics'

/*
========================================================================================
    SUBWORKFLOW TO COMPUTE STATISTICS FOR ALL GENES
========================================================================================
*/

workflow GENE_STATISTICS {

    take:
    ch_all_imputed_counts          // channel: ( meta, imputed merged counts ) — all platforms combined
    ch_all_counts                  // channel: ( meta, non-imputed merged counts ) — joined with the above by meta
    ch_platform_counts             // channel: ( meta, per-platform merged counts ), never imputed
    ch_ratio_nulls_per_sample_file // channel: file with per-sample null ratios (collected for both calls)
    max_null_ratio_valid_sample    // value: threshold above which a sample is considered invalid

    main:

    // -----------------------------------------------------------------
    // PLATFORM-SPECIFIC STATISTICS
    // -----------------------------------------------------------------

    // platform counts have not been imputed
    // the empty list [] fills the "imputed counts" input slot of the shared module
    PLATFORM(
        ch_platform_counts.map{ meta, file -> [ meta, file, [] ] },
        ch_ratio_nulls_per_sample_file.collect(),
        max_null_ratio_valid_sample
    )


    // -----------------------------------------------------------------
    // ALL DATA
    // -----------------------------------------------------------------

    // join pairs non-imputed and imputed counts on their meta key before collecting
    GLOBAL(
        ch_all_counts.join( ch_all_imputed_counts ).collect(),
        ch_ratio_nulls_per_sample_file.collect(),
        max_null_ratio_valid_sample
    )

    emit:
    stats = GLOBAL.out.stats            // statistics over the full merged dataset
    platform_stats = PLATFORM.out.stats // statistics per platform

}
include { MAKE_CHUNKS } from '../../../modules/local/genorm/make_chunks'
include { CROSS_JOIN } from '../../../modules/local/genorm/cross_join'
include { EXPRESSION_RATIO } from '../../../modules/local/genorm/expression_ratio'
include { RATIO_STANDARD_VARIATION } from '../../../modules/local/genorm/ratio_standard_variation'
include { COMPUTE_M_MEASURE } from '../../../modules/local/genorm/compute_m_measure'

/*
========================================================================================
    SUBWORKFLOW TO COMPUTE PAIRWISE GENE VARIATION (ADAPTED VERSION OF GENORM)
========================================================================================
*/

workflow GENORM {

    take:
    ch_counts // channel: ( meta, counts ) — meta must be joinable with the ratio files below


    main:

    // -----------------------------------------------------------------
    // MAKE CHUNKS OF GENE COUNTS
    // -----------------------------------------------------------------

    MAKE_CHUNKS( ch_counts )

    // we need to flatten to set each chunk file as a separate item in the channel
    ch_count_chunks = getUniqueFilePairs( MAKE_CHUNKS.out.chunks.transpose() )

    // -----------------------------------------------------------------
    // CROSS JOIN CHUNKS
    // -----------------------------------------------------------------

    CROSS_JOIN( ch_count_chunks )

    // -----------------------------------------------------------------
    // PAIRWISE EXPRESSION RATIOS
    // -----------------------------------------------------------------

    EXPRESSION_RATIO( CROSS_JOIN.out.data )

    // -----------------------------------------------------------------
    // STANDARD VARIATION OF EXPRESSION RATIOS
    // -----------------------------------------------------------------

    RATIO_STANDARD_VARIATION( EXPRESSION_RATIO.out.data )

    // -----------------------------------------------------------------
    // COMPUTE M-MEASURE
    // -----------------------------------------------------------------

    // regroup all per-pair ratio files under their section key,
    // then join back onto the original counts by meta
    ch_ratio_files = RATIO_STANDARD_VARIATION.out.data
        .map{ meta, file -> [ [ section: meta.section ], file ] }
        .groupTuple()

    COMPUTE_M_MEASURE(
        ch_counts.join( ch_ratio_files )
    )

    emit:
    m_measures = COMPUTE_M_MEASURE.out.m_measures

}


/*
========================================================================================
    FUNCTIONS
========================================================================================
*/

//
// Generate channels consisting of unique pairs of files
//
// Given ( meta, chunk_file ) items whose filenames carry a chunk index as the
// second dot-separated token (e.g. 'std.3.parquet' -> '3'), emits one item
// ( meta + [index_1, index_2], file_i, file_j ) per unordered pair of chunks
// sharing the same meta, including the self-pair (i == j).
//
def getUniqueFilePairs( ch_count_chunks ) {

    def ch_count_chunks_with_indexes = ch_count_chunks
        .map { meta, file -> [meta, file.name.tokenize('.')[1], file] } // extract file index

    return ch_count_chunks_with_indexes
        .combine( // full cartesian product with itself, using the meta map as key
            ch_count_chunks_with_indexes,
            by: 0
        )
        .filter {
            // NOTE: i and j are strings, so this is a lexicographic comparison;
            // it still keeps exactly one ordering of each unordered pair, but
            // index_1/index_2 may not be numerically ordered once there are
            // >= 10 chunks — confirm downstream does not rely on numeric order.
            meta, i, file_i, j, file_j -> i <= j } // keeps only pairs where i <= j
        .map {
            meta, i, file_i, j, file_j ->
                def new_meta = meta + [ index_1: i, index_2: j ] // puts indexes in a meta tuple
                [ new_meta, file_i, file_j ]
        }
}
include { EXPRESSIONATLAS_GETACCESSIONS as EXPRESSION_ATLAS } from '../../../modules/local/expressionatlas/getaccessions'
include { GEO_GETACCESSIONS as GEO } from '../../../modules/local/geo/getaccessions'

/*
========================================================================================
    SUBWORKFLOW TO FETCH EXPRESSION ATLAS AND NCBI GEO ACCESSIONS
========================================================================================
*/

workflow GET_PUBLIC_ACCESSIONS {

    take:
    species                      // value: species name used for both providers
    skip_fetch_eatlas_accessions // value: boolean — when true, Expression Atlas is not queried
    fetch_geo_accessions         // value: boolean — when true, GEO is queried
    platform                     // value: optional platform filter (empty list when falsy)
    keywords                     // value: search keywords; may be an empty string
    ch_accessions                // channel: accessions supplied directly by the user
    ch_accessions_file           // channel: file of user-supplied accessions, one per line
    ch_excluded_accessions       // channel: accessions to exclude
    ch_excluded_accessions_file  // channel: file of accessions to exclude, one per line
    random_sampling_size         // value: optional sampling size (empty list when falsy)
    random_sampling_seed         // value: optional sampling seed (empty list when falsy)
    outdir                       // value: pipeline output directory (for collectFile storeDir)

    main:

    ch_fetched_eatlas_accessions = channel.empty()
    ch_fetched_geo_accessions = channel.empty()
    ch_sampling_quota = channel.of( "ok" )

    // -----------------------------------------------------------------
    // GET EATLAS ACCESSIONS
    // -----------------------------------------------------------------

    // fetching Expression Atlas accessions if applicable
    if ( !skip_fetch_eatlas_accessions ) {

        // getting Expression Atlas accessions given a species name and keywords
        // keywords can be an empty string
        // the Elvis operator substitutes [] for falsy optional parameters
        EXPRESSION_ATLAS(
            species,
            keywords,
            platform ?: [],
            random_sampling_size ?: [],
            random_sampling_seed ?: []
        )

        ch_fetched_eatlas_accessions = EXPRESSION_ATLAS.out.accessions.splitText()
        ch_sampling_quota = EXPRESSION_ATLAS.out.sampling_quota

    }

    // ------------------------------------------------------------------------------------
    // GET GEO ACCESSIONS
    // ------------------------------------------------------------------------------------

    // fetching GEO accessions if applicable
    if ( fetch_geo_accessions ) {

        // all Expression Atlas accessions starting with E-GEOD- are imported from GEO
        // we do not want to collect these GEO data if we already get them from Expression Atlas
        ch_excluded_eatlas_accessions_file = ch_fetched_eatlas_accessions
            .filter { accession -> accession.startsWith("E-GEOD-") }
            .map { accession -> accession.replace("E-GEOD-", "GSE") }
            .collectFile(
                name: 'excluded_geo_accessions.txt',
                storeDir: "${outdir}/geo/",
                sort: true,
                newLine: true
            )
            .ifEmpty( [] )

        // trick to avoid fetching accessions from GEO when the sampling quota is already exceeded
        ch_species = channel.of( species )
            .combine( ch_sampling_quota )
            .filter { species_name, quota -> quota == "ok" }
            .map { species_name, quota -> species_name }

        // getting GEO accessions given a species name and keywords
        // keywords can be an empty string
        GEO(
            ch_species,
            keywords,
            platform ?: [],
            ch_excluded_eatlas_accessions_file,
            random_sampling_size ?: [],
            random_sampling_seed ?: []
        )

        ch_fetched_geo_accessions = GEO.out.accessions.splitText()
    }

    // -----------------------------------------------------------------
    // MERGING AND EXCLUDING UNWANTED ACCESSIONS
    // -----------------------------------------------------------------

    // getting accessions to exclude and preparing in the right format
    ch_excluded_accessions = ch_excluded_accessions
        .mix( ch_excluded_accessions_file.splitText() )
        .unique()
        .map { acc -> acc.trim() }
        .toList()
        .map { lst -> [lst] } // list of lists : mandatory when combining in the next step

    // appending to accessions provided by the user
    // ensures that no accessions is present twice (provided by the user and fetched from E. Atlas)
    // removing E-PROT- accessions because they are not supported in subsequent steps
    // removing excluded accessions
    ch_fetched_public_accessions = ch_fetched_eatlas_accessions
        .mix( ch_fetched_geo_accessions )
        .map { acc -> acc.trim() }
        .filter { acc ->
            (acc.startsWith('E-') || acc.startsWith('GSE')) && !acc.startsWith('E-PROT-')
        }
        .combine ( ch_excluded_accessions )
        .filter { accession, excluded_accessions -> !(accession in excluded_accessions) }
        .map { accession, excluded_accessions -> accession }

    // -----------------------------------------------------------------
    // ADDING USER PROVIDED ACCESSIONS
    // -----------------------------------------------------------------

    ch_input_accessions = ch_accessions
        .mix( ch_accessions_file.splitText() )
        .unique()
        .map { acc -> acc.trim() }

    ch_all_accessions = ch_input_accessions
        .mix( ch_fetched_public_accessions )
        .unique()
        .map { acc -> acc.trim() }

    emit:
    accessions = ch_all_accessions // deduplicated, trimmed accession strings

}
include { COMPUTE_GENE_TRANSCRIPT_LENGTHS } from '../../../modules/local/compute_gene_transcript_lengths'
include { DOWNLOAD_ENSEMBL_ANNOTATION } from '../../../modules/local/download_ensembl_annotation'


/*
========================================================================================
    SUBWORKFLOW TO GET TRANSCRIPT LENGTHS FROM GENOME ANNOTATION
========================================================================================
*/

workflow GET_TRANSCRIPT_LENGTHS {

    take:
    species  // value: species name, used only when no GFF file is provided
    gff_file // value: optional path to a user-supplied GFF annotation

    main:

    // use the provided annotation when available, otherwise fetch it from Ensembl
    if ( gff_file ) {
        ch_annotation = channel.fromPath( gff_file, checkIfExists: true )
    } else {
        DOWNLOAD_ENSEMBL_ANNOTATION( species )
        ch_annotation = DOWNLOAD_ENSEMBL_ANNOTATION.out.gff3
    }

    COMPUTE_GENE_TRANSCRIPT_LENGTHS( ch_annotation )



    emit:
    csv = COMPUTE_GENE_TRANSCRIPT_LENGTHS.out.csv // per-gene transcript length table



}
include { CLEAN_GENE_IDS } from '../../../modules/local/clean_gene_ids'
include { EXTRACT_GENE_IDS } from '../../../modules/local/extract_gene_ids'
include { COLLECT_ALL_GENE_IDS } from '../../../modules/local/collect_all_gene_ids'
include { GPROFILER_IDMAPPING } from '../../../modules/local/gprofiler/idmapping'
include { DETECT_RARE_GENES } from '../../../modules/local/detect_rare_genes'
include { FILTER_AND_RENAME_GENES } from '../../../modules/local/filter_and_rename_genes'

/*
========================================================================================
    SUBWORKFLOW TO MAP GENE IDS TO COMMON IDS AMONG ALL DATASETS AND TO REMOVE RARE GENES
========================================================================================
*/

workflow ID_MAPPING {

    take:
    ch_counts               // channel: ( meta, counts ) datasets whose gene ids get mapped
    species                 // value: species name for g:Profiler
    skip_id_mapping         // value: boolean — when true, ids are used as-is
    skip_cleaning_gene_ids  // value: boolean — skip id cleaning prior to mapping
    gprofiler_target_db     // value: target namespace for g:Profiler mapping
    custom_gene_id_mapping  // value: optional user CSV (original_gene_id,gene_id)
    custom_gene_metadata    // value: optional user CSV (gene_id,name,description)
    min_occurrence_freq     // value: rare-gene frequency threshold
    min_occurrence_quantile // value: rare-gene quantile threshold
    outdir                  // value: pipeline output directory (for collectFile storeDir)

    main:

    ch_gene_id_mapping = channel.empty()
    ch_gene_metadata = channel.empty()


    // -----------------------------------------------------------------
    // IN CASE OF ID MAPPING, CLEANING GENE IDS BEFOREHAND
    // -----------------------------------------------------------------

    if ( !skip_id_mapping && !skip_cleaning_gene_ids ) {

        // ensuring that all gene ids are valid before mapping
        CLEAN_GENE_IDS ( ch_counts )
        ch_counts = CLEAN_GENE_IDS.out.counts

    }

    // -----------------------------------------------------------------
    // EXTRACTING GENE IDS FROM COUNTS FILE
    // -----------------------------------------------------------------

    EXTRACT_GENE_IDS ( ch_counts )
    ch_gene_ids = EXTRACT_GENE_IDS.out.gene_ids



    if ( skip_id_mapping ) {

        // -----------------------------------------------------------------
        // MAKING FILE CONTAINING ALL UNIQUE GENE IDS (ALL GENE IDS ARE VALID)
        // -----------------------------------------------------------------

        ch_valid_gene_ids = ch_gene_ids
            .splitText()
            .map { it.trim() }
            .unique()
            .collectFile(
                name: 'gene_ids.txt',
                newLine: true,
                storeDir: "${outdir}/idmapping/",
                sort: true
            )

    } else {

        // -----------------------------------------------------------------
        // COLLECTING ALL CLEANED GENE IDS FROM ALL DATASETS
        // -----------------------------------------------------------------

        // sorting files in order to have a consistent input and be able to retry
        COLLECT_ALL_GENE_IDS(
            ch_gene_ids.toSortedList()
        )

        // -----------------------------------------------------------------
        // MAPPING THESE GENE IDS TO THE CHOSEN TARGET DB
        // -----------------------------------------------------------------

        GPROFILER_IDMAPPING(
            COLLECT_ALL_GENE_IDS.out.unique_gene_ids,
            species,
            gprofiler_target_db
        )
        ch_gene_id_mapping = GPROFILER_IDMAPPING.out.mapping
        ch_gene_metadata = GPROFILER_IDMAPPING.out.metadata

        // -----------------------------------------------------------------
        // FILTERING OUT GENE IDS THAT DO NOT HAVE ENOUGH OCCURRENCES
        // -----------------------------------------------------------------

        // ch_counts.count() supplies the number of datasets for the frequency computation
        DETECT_RARE_GENES(
            ch_gene_id_mapping,
            COLLECT_ALL_GENE_IDS.out.gene_id_occurrences,
            ch_counts.count(),
            min_occurrence_freq,
            min_occurrence_quantile
        )
        ch_valid_gene_ids = DETECT_RARE_GENES.out.valid_gene_ids
    }

    // -----------------------------------------------------------------
    // COLLECTING GLOBAL GENE ID MAPPING AND METADATA
    // -----------------------------------------------------------------

    // merge pipeline-produced and user-provided mappings into one deduplicated CSV
    ch_global_gene_id_mapping = ch_gene_id_mapping
        .mix(
            custom_gene_id_mapping ?
                channel.fromPath( custom_gene_id_mapping, checkIfExists: true ) :
                channel.empty()
        )
        .splitCsv( header: true )
        .unique()
        .collectFile(
            name: 'global_gene_id_mapping.csv',
            seed: "original_gene_id,gene_id",
            newLine: true,
            storeDir: "${outdir}/idmapping/",
            sort: true
        ) {
            item -> "${item["original_gene_id"]},${item["gene_id"]}"
        }

    ch_global_gene_metadata = ch_gene_metadata
        .mix(
            custom_gene_metadata ?
                channel.fromPath( custom_gene_metadata, checkIfExists: true ) :
                channel.empty()
        )
        .splitCsv( header: true )
        .unique()
        .collectFile(
            name: 'global_gene_metadata.csv',
            seed: "gene_id,name,description",
            newLine: true,
            storeDir: "${outdir}/idmapping/",
            sort: true
        ) {
            item -> "${item["gene_id"]},${item["name"]},${item["description"]}"
        }

    // -----------------------------------------------------------------
    // RENAMING GENE IDS IN ALL COUNT DATASETS (ONLY IF NECESSARY)
    // -----------------------------------------------------------------

    if ( !skip_id_mapping || custom_gene_id_mapping ) {

        // .first() turns the single mapping file into a reusable value channel
        FILTER_AND_RENAME_GENES(
            ch_counts,
            ch_global_gene_id_mapping.first(),
            ch_valid_gene_ids.collect()
        )
        ch_counts = FILTER_AND_RENAME_GENES.out.counts

    }


    emit:
    counts = ch_counts                        // datasets with mapped/filtered gene ids
    mapping = ch_global_gene_id_mapping       // global original->target id mapping CSV
    metadata = ch_global_gene_metadata        // global gene metadata CSV
    valid_gene_ids = ch_valid_gene_ids        // file of gene ids retained after rarity filtering

}
include { MERGE_COUNTS as PLATFORM } from '../../../modules/local/merge_counts'
include { MERGE_COUNTS as GLOBAL } from '../../../modules/local/merge_counts'
include { IMPUTE_MISSING_VALUES } from '../../../modules/local/impute_missing_values'

/*
========================================================================================
    SUBWORKFLOW TO MERGE NORMALISED COUNT DATASETS AND DESIGNS AND IMPUTE MISSING VALUES
========================================================================================
*/

workflow MERGE_DATA {

    take:
    ch_normalised_counts  // channel: ( meta, counts ); meta carries platform, dataset and design
    missing_value_imputer // value: imputation method name forwarded to the module
    outdir                // value: pipeline output directory (for collectFile storeDir)

    main:

    // -----------------------------------------------------------------
    // MERGE COUNTS FOR EACH PLATFORM SEPARATELY
    // -----------------------------------------------------------------


    ch_normalised_rnaseq_counts = ch_normalised_counts.filter { meta, file -> meta.platform == "rnaseq" }
    ch_normalised_microarray_counts = ch_normalised_counts.filter { meta, file -> meta.platform == "microarray" }

    // sort: true keeps the file order deterministic, so the merge is resumable
    ch_collected_rnaseq_counts = ch_normalised_rnaseq_counts
        .map { meta, file -> file }
        .collect( sort: true )
        .map { files -> [ [ platform: "rnaseq" ], files ] }

    ch_collected_microarray_counts = ch_normalised_microarray_counts
        .map { meta, file -> file }
        .collect( sort: true )
        .map { files -> [ [ platform: "microarray" ], files ] }

    PLATFORM (
        ch_collected_rnaseq_counts.concat( ch_collected_microarray_counts )
    )

    ch_platform_counts = PLATFORM.out.counts

    // -----------------------------------------------------------------
    // MERGE ALL COUNTS
    // -----------------------------------------------------------------

    ch_collected_merged_counts = ch_platform_counts
        .map { meta, file -> file }
        .collect( sort: true )
        .map { files -> [ [ platform: "all" ], files ] }

    GLOBAL( ch_collected_merged_counts )
    ch_all_counts = GLOBAL.out.counts

    // -----------------------------------------------------------------
    // IMPUTE MISSING VALUES
    // -----------------------------------------------------------------

    IMPUTE_MISSING_VALUES(
        ch_all_counts.collect(),
        missing_value_imputer
    )

    // -----------------------------------------------------------------
    // MERGE ALL DESIGNS IN A SINGLE TABLE
    // -----------------------------------------------------------------

    ch_whole_design = ch_normalised_counts
        .map {
            meta, file -> // extracts design file and adds batch column whenever missing (for custom datasets)
                def design_content = meta.design.splitCsv( header: true )
                // if there is no batch, it is custom data
                def updated_design_content = design_content.collect { row ->
                    row.batch = row.batch ?: "custom_${meta.dataset}"
                    return row
                }
                [ updated_design_content ]
        }
        .flatten()
        .unique()
        .collectFile(
            name: 'whole_design.csv',
            seed: "batch,condition,sample",
            newLine: true,
            sort: true,
            storeDir: "${outdir}/merged_datasets/"
        ) {
            item -> "${item.batch},${item.condition},${item.sample}"
        }

    emit:
    all_imputed_counts = IMPUTE_MISSING_VALUES.out.counts // merged counts after imputation
    all_counts = ch_all_counts                            // merged counts before imputation
    platform_counts = ch_platform_counts                  // per-platform merged counts
    whole_design = ch_whole_design                        // combined design CSV (batch,condition,sample)
}
IN A SINGLE TABLE + // ----------------------------------------------------------------- + + ch_whole_design = ch_normalised_counts + .map { + meta, file -> // extracts design file and adds batch column whenever missing (for custom datasets) + def design_content = meta.design.splitCsv( header: true ) + // if there is no batch, it is custom data + def updated_design_content = design_content.collect { row -> + row.batch = row.batch ?: "custom_${meta.dataset}" + return row + } + [ updated_design_content ] + } + .flatten() + .unique() + .collectFile( + name: 'whole_design.csv', + seed: "batch,condition,sample", + newLine: true, + sort: true, + storeDir: "${outdir}/merged_datasets/" + ) { + item -> "${item.batch},${item.condition},${item.sample}" + } + + emit: + all_imputed_counts = IMPUTE_MISSING_VALUES.out.counts + all_counts = ch_all_counts + platform_counts = ch_platform_counts + whole_design = ch_whole_design +} diff --git a/subworkflows/local/reporting/main.nf b/subworkflows/local/reporting/main.nf new file mode 100644 index 00000000..9d4fa676 --- /dev/null +++ b/subworkflows/local/reporting/main.nf @@ -0,0 +1,400 @@ +include { AGGREGATE_RESULTS } from '../../../modules/local/aggregate_results' +include { DASH_APP } from '../../../modules/local/dash_app' +include { COLLECT_STATISTICS } from '../../../modules/local/collect_statistics' +include { MULTIQC } from '../../../modules/nf-core/multiqc' + + +include { methodsDescriptionText } from '../utils_nfcore_stableexpression_pipeline' +include { paramsSummaryMultiqc } from '../../nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../../nf-core/utils_nfcore_pipeline' +include { paramsSummaryMap } from 'plugin/nf-schema' + +/* +======================================================================================== + SUBWORKFLOW DEDICATED TO AGGREGATION OF RESULTS AND REPORTING (DASH APP AND MULTIQC) +======================================================================================== +*/ + 
+workflow REPORTING { + + take: + ch_all_counts + ch_whole_design + ch_stats_all_genes_with_scores + ch_platform_statistics + ch_whole_gene_metadata + ch_whole_gene_id_mapping + target_genes + target_gene_file + multiqc_config + multiqc_logo + multiqc_methods_description + outdir + + main: + + ch_versions = channel.empty() + + // ----------------------------------------------------------------- + // AGGREGATE ALL RESULTS FOR MULTIQC + // ----------------------------------------------------------------- + + ch_target_gene_file = target_gene_file ? channel.fromPath( target_gene_file, checkIfExists: true ) : channel.empty() + + ch_target_gene_list = channel.fromList( target_genes.tokenize(',') ) + .mix( ch_target_gene_file.splitText() ) + .map { it.trim() } + .filter { it != "" } + .unique() + .toSortedList() + + ch_custom_content_multiqc_config_template = channel.fromPath( + "${projectDir}/assets/multiqc_config.custom_content.template.yaml", + checkIfExists: true + ) + + AGGREGATE_RESULTS ( + ch_all_counts.map{ meta, file -> file }.collect(), + ch_stats_all_genes_with_scores.collect(), + ch_platform_statistics.collect(), + ch_target_gene_list, + ch_whole_gene_metadata.collect().ifEmpty([]), // handle case where there are no mappings + ch_whole_gene_id_mapping.collect().ifEmpty([]), // handle case where there are no mappings + ch_custom_content_multiqc_config_template.collect() + ) + + ch_all_genes_summary = AGGREGATE_RESULTS.out.all_genes_summary + ch_most_stable_genes_summary = AGGREGATE_RESULTS.out.most_stable_genes_summary + ch_most_stable_genes_transposed_counts = AGGREGATE_RESULTS.out.most_stable_genes_transposed_counts_filtered + ch_custom_content_multiqc_config = AGGREGATE_RESULTS.out.custom_content_multiqc_config + + // ----------------------------------------------------------------- + // DASH APPLICATION + // ----------------------------------------------------------------- + + DASH_APP( + ch_all_counts.map{ meta, file -> file }.collect(), + 
ch_whole_design.collect(), + ch_all_genes_summary.collect() + ) + ch_versions = ch_versions.mix ( DASH_APP.out.versions ) + + + // ------------------------------------------------------------------------------------ + // PREPARING BAR PLOTS + // ------------------------------------------------------------------------------------ + + ch_id_mapping_stats = channel.topic('mqc_id_mapping_stats') + .collectFile( + name: 'id_mapping_stats.csv', + seed: "dataset,final,merged,not_valid,unmapped", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> "${item[0]},${item[1]},${item[2]},${item[3]},${item[4]}" + } + + ch_missing_values_filter_stats = channel.topic('mqc_missing_values_filter_stats') + .collectFile( + name: 'missing_values_filter_stats.csv', + seed: "dataset,kept,rejected", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> "${item[0]},${item[1]},${item[2]}" + } + + ch_zero_values_filter_stats = channel.topic('mqc_zero_values_filter_stats') + .collectFile( + name: 'zero_values_filter_stats.csv', + seed: "dataset,kept,rejected", + newLine: true, + storeDir: "${outdir}/statistics/" + ) { + item -> "${item[0]},${item[1]},${item[2]}" + } + + // ------------------------------------------------------------------------------------ + // PREPARING BOX PLOTS + // ------------------------------------------------------------------------------------ + + ch_skewness = channel.topic('skewness') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with skewness values + .collectFile( + name: 'skewness.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + + ch_ratio_zeros = channel.topic('ratio_zeros') + .map { dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with ratio values + .collectFile( + name: 'ratio_zeros.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + ch_ratio_nulls = channel.topic('ratio_nulls') + .map { 
dataset, file -> "${dataset},${file.readLines()[0]}" } // concatenate dataset name with ratio values + .collectFile( + name: 'ratio_nulls.csv', + newLine: true, + sort: true, + storeDir: "${outdir}/statistics/" + ) + + ch_stat_files = ch_skewness + .mix( ch_ratio_nulls ) + .mix( ch_ratio_zeros ) + + COLLECT_STATISTICS( ch_stat_files ) + + // ------------------------------------------------------------------------------------ + // FAILURE / WARNING REPORTS + // ------------------------------------------------------------------------------------ + + ch_eatlas_failure_reasons = channel.topic('eatlas_failure_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'eatlas_failure_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/", + ) { + item -> "${item[0]},${item[1]}" + } + + ch_eatlas_warning_reasons = channel.topic('eatlas_warning_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'eatlas_warning_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]},${item[1]}" + } + + ch_geo_failure_reasons = channel.topic('geo_failure_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'geo_failure_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]},${item[1]}" + } + + + ch_geo_warning_reasons = channel.topic('geo_warning_reason') + .map { accession, file -> [ accession, file.readLines()[0] ] } + .collectFile( + name: 'geo_warning_reasons.csv', + seed: "Accession,Reason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]},${item[1]}" + } + + ch_id_cleaning_failure_reasons = channel.topic('id_cleaning_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + 
name: 'id_cleaning_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_id_mapping_warning_reasons = channel.topic('renaming_warning_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'renaming_warning_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_id_mapping_failure_reasons = channel.topic('renaming_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'renaming_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_normalisation_warning_reasons = channel.topic('normalisation_warning_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'normalisation_warning_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/warnings/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + ch_normalisation_failure_reasons = channel.topic('normalisation_failure_reason') + .map { dataset, file -> [ dataset, file.readLines()[0] ] } + .collectFile( + name: 'normalisation_failure_reasons.tsv', + seed: "Dataset\tReason", + newLine: true, + sort: true, + storeDir: "${outdir}/errors/" + ) { + item -> "${item[0]}\t${item[1]}" + } + + + // ------------------------------------------------------------------------------------ + // MULTIQC FILES + // ------------------------------------------------------------------------------------ + + ch_multiqc_files = channel.empty() + .mix( ch_most_stable_genes_summary.collect() ) // single item + .mix( ch_all_genes_summary.collect() ) // single item + .mix( ch_most_stable_genes_transposed_counts.collect() ) // single item + .mix( 
channel.topic('eatlas_all_datasets').toSortedList() ) + .mix( channel.topic('eatlas_selected_datasets').toSortedList() ) + .mix( channel.topic('geo_all_datasets').toSortedList() ) + .mix( channel.topic('geo_selected_datasets').toSortedList() ) + .mix( channel.topic('geo_rejected_datasets').toSortedList() ) + .mix( channel.topic('total_gene_id_occurrence_quantiles').toSortedList() ) + .mix( COLLECT_STATISTICS.out.csv ) + .mix( ch_id_mapping_stats ) + .mix( ch_missing_values_filter_stats ) + .mix( ch_zero_values_filter_stats ) + .mix( ch_eatlas_failure_reasons ) + .mix( ch_eatlas_warning_reasons ) + .mix( ch_geo_failure_reasons ) + .mix( ch_geo_warning_reasons ) + .mix( ch_id_cleaning_failure_reasons ) + .mix( ch_id_mapping_warning_reasons ) + .mix( ch_id_mapping_failure_reasons ) + .mix( ch_normalisation_failure_reasons ) + .mix( ch_normalisation_warning_reasons ) + + + // ------------------------------------------------------------------------------------ + // VERSIONS + // ------------------------------------------------------------------------------------ + + // Collate and save software versions + // + def topic_versions = channel.topic("versions") + .distinct() + .branch { entry -> + versions_file: entry instanceof Path + versions_tuple: true + } + + def topic_versions_string = topic_versions.versions_tuple + .map { process, tool, version -> + [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] + } + .groupTuple(by:0) + .map { process, tool_versions -> + tool_versions.unique().sort() + "${process}:\n${tool_versions.join('\n')}" + } + + ch_collated_versions = softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) + .mix(topic_versions_string) + .collectFile( + storeDir: "${outdir}/pipeline_info", + name: 'nf_core_' + 'stableexpression_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ) + + // ------------------------------------------------------------------------------------ + // PREPARE MULTIQC INPUT + // 
------------------------------------------------------------------------------------ + + ch_multiqc_config = channel.fromPath( + "$projectDir/assets/multiqc_config.yml", checkIfExists: true) + + ch_multiqc_custom_config = multiqc_config ? + channel.fromPath(multiqc_config, checkIfExists: true) : + channel.empty() + + ch_multiqc_logo = multiqc_logo ? + channel.fromPath(multiqc_logo, checkIfExists: true) : + channel.of([]) + + summary_params = paramsSummaryMap( + workflow, + parameters_schema: "nextflow_schema.json" + ) + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) + + ch_multiqc_files = ch_multiqc_files + .mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ) + + ch_multiqc_custom_methods_description = multiqc_methods_description ? + file(multiqc_methods_description, checkIfExists: true) : + file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + ch_methods_description = channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description) + ) + + // ------------------------------------------------------------------------------------ + // ADDING KEY TO JOIN ON + // ------------------------------------------------------------------------------------ + + ch_multiqc_file_list = ch_multiqc_files + .mix( ch_collated_versions ) + .mix( + ch_methods_description.collectFile( + name: 'methods_description_mqc.yaml', + sort: true + ) + ) + .flatten() + .toSortedList() + .map{ list -> [ [id: 'Final report'], list ] } + + ch_multiqc_config_list = ch_multiqc_config + .mix( ch_multiqc_custom_config ) + .mix( ch_custom_content_multiqc_config ) + .toSortedList() + .map{ list -> [ [id: 'Final report'], list ] } + + ch_multiqc_logo = ch_multiqc_logo.map{ file -> [ [id: 'Final report'], file ] } + + // ------------------------------------------------------------------------------------ + // MULTIQC + // ------------------------------------------------------------------------------------ + + 
ch_multiqc_input = ch_multiqc_file_list + .join( ch_multiqc_config_list ) + .join( ch_multiqc_logo ) + .map { meta, files, configs, logo -> [ meta, files, configs, logo , [], [] ] } + + MULTIQC ( ch_multiqc_input ) + + emit: + multiqc_report = MULTIQC.out.report + all_genes_summary = ch_all_genes_summary +} diff --git a/subworkflows/local/sample_filtering/main.nf b/subworkflows/local/sample_filtering/main.nf new file mode 100644 index 00000000..31006f3f --- /dev/null +++ b/subworkflows/local/sample_filtering/main.nf @@ -0,0 +1,62 @@ +include { FILTER_OUT_SAMPLES_WITH_TOO_MANY_ZEROS as TOO_MANY_ZEROS } from '../../../modules/local/filter_out_samples/with_too_many_zeros' +include { FILTER_OUT_SAMPLES_WITH_TOO_MANY_MISSING_VALUES as TOO_MANY_MISSING_VALUES } from '../../../modules/local/filter_out_samples/with_too_many_missing_values' + + +/* +======================================================================================== + SUBWORKFLOW TO FILTER OUT INVALID SAMPLES AND EMIT STATISTICS ABOUT ZEROS / MISSING VALUES +======================================================================================== +*/ + +workflow SAMPLE_FILTERING { + + take: + ch_counts + ch_valid_gene_ids + max_zero_ratio + max_null_ratio + outdir + + main: + + // ----------------------------------------------------------------- + // REMOVE SAMPLES WITH TOO MANY ZEROS + // ----------------------------------------------------------------- + + TOO_MANY_ZEROS ( + ch_counts, + max_zero_ratio + ) + + // ----------------------------------------------------------------- + // REMOVE SAMPLES WITH TOO MANY MISSING VALUES + // ----------------------------------------------------------------- + + TOO_MANY_MISSING_VALUES( + TOO_MANY_ZEROS.out.counts, + ch_valid_gene_ids.collect(), + max_null_ratio + ) + + // ----------------------------------------------------------------- + // GET NUMBER OF NULLS PER SAMPLE + // ----------------------------------------------------------------- + + 
ch_ratio_nulls_per_sample_file = TOO_MANY_MISSING_VALUES.out.ratio_nulls_per_sample + .splitCsv( header: true ) + .collectFile( + name: 'ratio_nulls_per_sample.csv', + seed: "sample,ratio", + newLine: true, + storeDir: "${outdir}/statistics/", + sort: true + ) + { + item -> "${item["sample"]},${item["ratio"]}" + } + + emit: + counts = TOO_MANY_MISSING_VALUES.out.counts + ratio_nulls_per_sample_file = ch_ratio_nulls_per_sample_file + +} diff --git a/subworkflows/local/stability_scoring/main.nf b/subworkflows/local/stability_scoring/main.nf new file mode 100644 index 00000000..d6a2de39 --- /dev/null +++ b/subworkflows/local/stability_scoring/main.nf @@ -0,0 +1,87 @@ +include { GET_CANDIDATE_GENES } from '../../../modules/local/get_candidate_genes' +include { NORMFINDER } from '../../../modules/local/normfinder' +include { COMPUTE_STABILITY_SCORES } from '../../../modules/local/compute_stability_scores' + +include { GENORM } from '../genorm' + +/* +======================================================================================== + SUBWORKFLOW TO COMPUTE STABILITY SCORES +======================================================================================== +*/ + +workflow STABILITY_SCORING { + + take: + ch_counts + ch_design + ch_stats + nb_candidates_per_section + nb_sections + skip_genorm + stability_score_weights + + main: + + // ----------------------------------------------------------------- + // GETTING CANDIDATE GENES + // ----------------------------------------------------------------- + + GET_CANDIDATE_GENES( + ch_counts.collect(), // single item + ch_stats.collect(), // single item + nb_candidates_per_section, + nb_sections + ) + + ch_candidate_gene_counts = splitBySection( GET_CANDIDATE_GENES.out.counts ) + ch_section_stats = splitBySection( GET_CANDIDATE_GENES.out.section_stats ) + + // ----------------------------------------------------------------- + // NORMFINDER + // ----------------------------------------------------------------- + + 
NORMFINDER ( + ch_candidate_gene_counts, + ch_design.collect() // single item + ) + ch_normfinder_stabilities = NORMFINDER.out.stability_values + + // ----------------------------------------------------------------- + // GENORM + // ----------------------------------------------------------------- + + if ( !skip_genorm ) { + GENORM ( ch_candidate_gene_counts ) + ch_genorm_stability = GENORM.out.m_measures + } else { + ch_genorm_stability = channel.value([:]) + } + + // ----------------------------------------------------------------- + // AGGREGATION AND FINAL STABILITY SCORE + // ----------------------------------------------------------------- + + COMPUTE_STABILITY_SCORES ( + ch_normfinder_stabilities.join( ch_genorm_stability ).join( ch_section_stats ), + stability_score_weights + ) + + emit: + summary_statistics = COMPUTE_STABILITY_SCORES.out.stats_with_stability_scores + +} + +/* +======================================================================================== + FUNCTIONS +======================================================================================== +*/ + +def splitBySection( ch_files ) { + return ch_files + .map { files -> + files.collect { file -> [ [ section: file.name.tokenize(".")[0] ], file ] } + } + .flatMap{ n -> n } // turns a channel of one list of n files into a channel of n files +} diff --git a/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf b/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf index 0c910a6b..ed736dd6 100644 --- a/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_stableexpression_pipeline/main.nf @@ -11,7 +11,6 @@ include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' include { samplesheetToList } from 'plugin/nf-schema' -include { paramsHelp } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { 
completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' @@ -39,8 +38,6 @@ workflow PIPELINE_INITIALISATION { main: - ch_versions = channel.empty() - // // Print version and exit if required and dump pipeline parameters to JSON file // @@ -71,7 +68,7 @@ workflow PIPELINE_INITIALISATION { * Software dependencies https://github.com/nf-core/stableexpression/blob/main/CITATIONS.md """ - command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " + command = "nextflow run ${workflow.manifest.name} -profile --species --outdir " UTILS_NFSCHEMA_PLUGIN ( workflow, @@ -93,32 +90,23 @@ workflow PIPELINE_INITIALISATION { ) // - // Create channel from input file provided through params.input + // Custom validation for pipeline parameters // + validateInputParameters( params ) - channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) - } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] - } - .set { ch_samplesheet } + // + // Create channel from datasets file provided through params.datasets + // + if (params.datasets) { + ch_input_datasets = parseInputDatasets( params.datasets ) + validateInputSamplesheet( ch_input_datasets ) + } else { + ch_input_datasets = channel.empty() + } emit: - samplesheet = ch_samplesheet - versions = ch_versions + input_datasets = ch_input_datasets + } /* @@ -174,20 +162,102 @@ workflow PIPELINE_COMPLETION { FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// +// Check and validate pipeline parameters +// + + +def check_accession(accession) { + if ( !( 
accession.startsWith('E-') || accession.startsWith('GSE') ) ) { + error('Accession ' + accession + ' is not well formated. All accessions should start with "E-" or "GSE".') + } +} + + +def check_accession_string(accessions_str) { + if ( accessions_str != null && accessions_str != "" ) { + accessions_str.tokenize(',').each { accession -> + check_accession(accession) + } + } +} + +def check_accession_file(accession_file) { + if ( accession_file != null ) { + def lines = new File(accession_file).readLines() + lines.each { accession -> + check_accession(accession) + } + } +} + +def validateInputParameters(params) { + + // checking that a species has been provided + if ( !params.species ) { + error('You must provide a species name') + } + + // if accessions are provided or excluded, checking that they are well formated + check_accession_string( params.accessions ) + check_accession_string( params.excluded_accessions ) + + check_accession_file( params.accessions_file ) + check_accession_file( params.excluded_accessions_file ) + + if ( params.keywords && params.skip_fetch_eatlas_accessions && !params.fetch_geo_accessions ) { + log.warn "Ignoring keywords as accessions will not be fetched from Expression Atlas or GEO" + } + +} + +// +// Parses files from input dataset and creates two subchannels raw and normalized +// with elements like [meta, count_file, normalised] +def parseInputDatasets(samplesheet) { + return channel.fromList( samplesheetToList(samplesheet, "assets/schema_datasets.json") ) + .map { + item -> + def (meta, count_file) = item + def new_meta = meta + [dataset: count_file.getBaseName()] + [new_meta, count_file] + } +} + // // Validate channels from input samplesheet // -def validateInputSamplesheet(input) { - def (metas, fastqs) = input[1..2] +def validateInputSamplesheet( ch_datasets ) { + // checking that all microarray datasets (if any) are normalised + ch_datasets + .filter { + meta, file -> + meta.platform == 'microarray' && !meta.normalised + } + 
.count() + .map { count -> + if (count > 0) { + def error_text = [ + "Error: You provided at least one microarray dataset that is not normalised. ", + "Microarray datasets must already be normalised before being submitted. ", + "Please perform normalisation (typically using RMA for one-colour intensities / LOESS (limma) for two-colour intensities) and run again." + ].join(' ').trim() + error(error_text) + } + } - // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end - def endedness_ok = metas.collect{ meta -> meta.single_end }.unique().size == 1 - if (!endedness_ok) { - error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") - } + // checking that all count files are well formated (same number of columns in header and rows) + ch_datasets + .map { meta, file -> + def header = file.withReader { reader -> reader.readLine() } + def separator = header.contains(',') ? "," : + header.contains('\t') ? 
"\t" : + " " + def first_row = file.splitCsv( header: false, skip: 1, limit: 1, sep: separator ) - return [ metas[0], fastqs + assert header.split(separator).size() == first_row[0].size() : "Header and first row do not have the same number of columns in file ${file}" + } } // // Generate methods description for MultiQC @@ -252,3 +322,110 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } + + +/* +======================================================================================== + FUNCTIONS FOR FORMATTING DATA FETCHED FROM EXPRESSION ATLAS / GEO +======================================================================================== +*/ + +// +// Get Expression Atlas Batch ID (accession + data_type) from file stem +// +def addDatasetIdToMetadata( ch_files ) { + return ch_files + .map { + file -> + def meta = [ dataset: file.getSimpleName() ] + [meta, file] + } +} + +// +// Groups design and data files by accession and data_type +// Design and count files have necessarily the same dataset ID (same file stem) +// +def groupFilesByDatasetId(ch_design, ch_counts) { + return ch_design + .concat( ch_counts ) // puts counts at the end of the resulting channel + .groupTuple() // groups by dataset ID; design files are necessarily BEFORE count files + .filter { + it.get(1).size() == 2 // only groups with two files + } + .filter { // only groups with first file as design file and second one as count file + meta, files -> + files.get(0).name.endsWith('.design.csv') && !files.get(1).name.endsWith('.design.csv') + } + .map { // putting design file in meta + meta, files -> + def new_meta = meta + [design: files[0]] + [new_meta, files[1]] + } +} + +def getNthPartFromEnd(String s, int n) { + def tokens = s.tokenize('.') + return tokens[tokens.size() - n] +} + +// +// Add normalised: true / false in meta +// +def augmentMetadata( ch_files ) { + return ch_files + .map { + meta, file -> + def 
norm_state = getNthPartFromEnd(file.name, 3) + def normalised = false + if ( norm_state == 'normalised' ) { + normalised = true + } else if ( norm_state == 'raw' ) { + normalised = false + } else { + error("Invalid normalisation state: ${norm_state}") + } + + def platform = getNthPartFromEnd(file.name, 4) + def new_meta = meta + [normalised: normalised, platform: platform] + [new_meta, file] + } +} + + +/* +======================================================================================== + FUNCTIONS FOR CHECKING NB OF DATASETS +======================================================================================== +*/ + +def checkCounts(ch_counts, fetch_geo_accessions) { + + ch_counts.count().map { n -> + if( n == 0 ) { + // display a warning if no datasets are found + def msg_lst = [] + if ( !fetch_geo_accessions ) { + msg_lst = [ + "Could not find any readily usable public dataset...", + "This might be due to connection issues on the Expression Atlas FTP server.", + "If it is the case, please wait for a couple of minutes and run again.", + "Alternatively, datasets for your species of interest might not exist on Expression Atlas.", + "In this case, you can try to get additional datasets from NCBI GEO Datasets using the --fetch_geo_accessions flag (this feature is still experimental)." + ] + } else { + msg_lst = [ + "Could not find any readily usable public dataset...", + "This might be due to connection issues on the Expression Atlas FTP server.", + "If it is the case, please wait for a couple of minutes and run again.", + "You can check directly on NCBI GEO Datasets if there are available datasets for this species that you can prepare yourself:", + "https://www.ncbi.nlm.nih.gov/gds", + "Once you have prepared your own data, you can relaunch the pipeline and provide your prepared count datasets using the --datasets parameter. ", + "For more information, see the online documentation at https://nf-co.re/stableexpression." 
+ ] + } + def msg = msg_lst.join("\n").trim() + error(msg) + } + } +} diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index 2f30e9a4..bfd25876 100644 --- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -98,7 +98,7 @@ def workflowVersionToYAML() { // Get channel of software versions used in pipeline in YAML format // def softwareVersionsToYAML(ch_versions) { - return ch_versions.unique().map { version -> processVersionsFromYAML(version) }.unique().mix(channel.of(workflowVersionToYAML())) + return ch_versions.unique().map { version -> processVersionsFromYAML(version) }.unique().mix(Channel.of(workflowVersionToYAML())) } // diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index ee4738c8..acb39724 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -71,4 +71,3 @@ workflow UTILS_NFSCHEMA_PLUGIN { emit: dummy_emit = true } - diff --git a/tests/.nftignore b/tests/.nftignore index 83f7a0a5..6b84cdef 100644 --- a/tests/.nftignore +++ b/tests/.nftignore @@ -8,3 +8,9 @@ multiqc/multiqc_data/llms-full.txt multiqc/multiqc_plots/{svg,pdf,png}/*.{svg,pdf,png} multiqc/multiqc_report.html pipeline_info/*.{html,json,txt,yml} +**.py +**.parquet +**multiqc_geo*metadata.txt +**geo*metadata.tsv +**skewness.csv +**skewness.txt diff --git a/tests/act/README.md b/tests/act/README.md new file mode 100644 index 00000000..3c890a45 --- /dev/null +++ b/tests/act/README.md @@ -0,0 +1,47 @@ +# Mimic runs of nf-test in Github runners using act + +This folder contains all the necessary files to run `nf-test` tests using [act](https://nektosact.com/introduction.html). 
+ +## Install act + +To install `act`, simply run: + +``` +curl --proto '=https' --tlsv1.2 -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash +``` + +> [!NOTE] +> You might then have to place the act binary in a folder in your `$PATH`. + +> [!IMPORTANT] +> `act` uses `docker` under the hood. To install `docker`, see the [installation instructions](https://docs.docker.com/engine/install/). + +## Setup tests to run + +The `params.env` comprises all the necessary configuration to run the tests you need: + +- profile(s) +- Nextflow version + +## Run tests + +You need to specify in `params.env` the profile(s) that will be used. All the other nf-test arguments must be provided as usual. + +Example: + +```.env +#params.env +NXF_VER=25.04.0 +PROFILE=conda +``` + +``` +# from the root folder of your repo +tests/act/run --tag --debug --verbose +``` + +## Clean generated files + +``` +sudo rm -rf .nf-test +``` diff --git a/tests/act/actions/nf-test/action.yml b/tests/act/actions/nf-test/action.yml new file mode 100644 index 00000000..1b0c638e --- /dev/null +++ b/tests/act/actions/nf-test/action.yml @@ -0,0 +1,85 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile(s) to use for nf-test" + required: true + args: + description: "Arguments to pass to nf-test" + required: true +runs: + using: "composite" + steps: + - name: Install Node.js + uses: actions/setup-node@v6 + with: + node-version: 22 + + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + run: | + wget -qO- https://get.nf-test.com | bash + mv nf-test /usr/local/bin + + - uses: actions/cache@v5 + with: + path: /var/cache/apt/archives + key: apt-deps-${{ runner.os }} + + - name: Install apptainer dependencies + if: 
contains(inputs.profile, 'apptainer') + shell: bash + run: | + apt update + apt install -y libfuse3-3 uidmap fakeroot + + - name: Setup apptainer + if: contains(inputs.profile, 'apptainer') + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.4.5 + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile ${{ inputs.profile }} \ + ${{ inputs.args }} + + - name: Upload nf-test artifacts + if: failure() + uses: actions/upload-artifact@v4 + with: + name: nf-test-artifacts + path: .nf-test/tests + include-hidden-files: true + overwrite: true + compression-level: 0 diff --git a/tests/act/nf-test.yml b/tests/act/nf-test.yml new file mode 100644 index 00000000..52bde4aa --- /dev/null +++ b/tests/act/nf-test.yml @@ -0,0 +1,44 @@ +name: Run nf-test +on: + push: + +env: + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test: + name: nf-test + runs-on: local + env: + NXF_ANSI_LOG: false + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: Get profile + id: get_profile + run: | + if [ -z "${{ env.PROFILE }}" ]; then + echo "Using default profile ${{ env.DEFAULT_PROFILE }}" + echo "profile=${{ env.DEFAULT_PROFILE }}" >> $GITHUB_OUTPUT + else + echo "Using profile ${{ env.PROFILE }}" + echo "profile=${{ env.PROFILE }}" >> 
$GITHUB_OUTPUT + fi + + - name: Run nf-test + id: run_nf_test + uses: ./tests/act/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ env.NXF_VER }} + with: + profile: ${{ steps.get_profile.outputs.profile }} + args: ${{ env.ARGS }} diff --git a/tests/act/params.env b/tests/act/params.env new file mode 100644 index 00000000..d99a6307 --- /dev/null +++ b/tests/act/params.env @@ -0,0 +1,6 @@ +# for best reproducibility, use the full image +IMAGE=catthehacker/ubuntu:full-24.04 +# on some systems, the base image can work +#IMAGE=catthehacker/ubuntu:act-24.04 +NXF_VER=25.04.0 +DEFAULT_PROFILE=docker diff --git a/tests/act/run b/tests/act/run new file mode 100755 index 00000000..7b6a36ef --- /dev/null +++ b/tests/act/run @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +act_folder=$(dirname "$(realpath $0)") +root_folder=$(dirname $(dirname "${act_folder}")) + +ACT_OUTPUT_FOLDERNAME="act_output" +act_output_folder="${root_folder}/${ACT_OUTPUT_FOLDERNAME}" +mkdir -p $act_output_folder +######################################### +# Parse arguments +######################################### + +args="" +profile="" +bind_args="" +while [[ $# -gt 0 ]]; do + if [[ "$1" == "--profile" ]]; then + profile="$2" + shift 2 + else + args="${args} $1" # append the string to args + shift + fi +done + +echo "Running with args: ${args} and profile(s): ${profile}" + +######################################### +# Run act +######################################### + +act push \ + --job nf-test \ + --directory "${root_folder}" \ + --env-file "${act_folder}/params.env" \ + --env ARGS="${args}" \ + --env PROFILE="${profile}" \ + --workflows "${act_folder}/nf-test.yml" \ + --platform local=catthehacker/ubuntu:act-24.04 \ + --container-architecture linux/amd64 \ + --container-options "--privileged" \ + --artifact-server-path "${act_output_folder}" diff --git a/tests/default.nf.test b/tests/default.nf.test index 
43916ae9..58102a53 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -5,9 +5,143 @@ nextflow_pipeline { tag "pipeline" test("-profile test") { + tag "test" when { params { + species = 'prunus persica' + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_dataset_only") { + tag "test_dataset_only" + + when { + params { + species = 'mus musculus' + datasets = "https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_big.yaml" + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_public_and_dataset") { + tag "test_public_and_dataset" + + when { + params { + species = 'beta vulgaris' + keywords = "leaf" + datasets = "https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/input_beta_vulgaris.csv" + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + 
stable_path + ).match() } + ) + } + } + + /* + //TODO: see why it gives issues in CI + test("-profile test_fetch_geo") { + tag "test_fetch_geo" + + when { + params { + species = 'beta vulgaris' + fetch_geo_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + */ + + test("-profile test_accessions_only") { + tag "test_accessions_only" + + when { + params { + species = 'arabidopsis lyrata' + accessions_only = true + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + + test("-profile test_download_only") { + tag "test_download_only" + + when { + params { + species = 'aspergillus fumigatus' + download_only = true outdir = "$outputDir" } } @@ -30,4 +164,146 @@ nextflow_pipeline { ) } } + + test("-profile test_one_accession_low_gene_count") { + tag "test_one_accession_low_gene_count" + + when { + params { + species = 'arabidopsis thaliana' + 
accessions = "E-GEOD-51720" + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_skip_id_mapping") { + tag "test_skip_id_mapping" + + when { + params { + species = 'solanum tuberosum' + datasets = "${projectDir}/tests/test_data/input_datasets/input.csv" + skip_id_mapping = true + skip_fetch_eatlas_accessions = true + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + + test("-profile test_dataset_custom_mapping_and_gene_length") { + tag "test_dataset_custom_mapping_and_gene_length" + + when { + params { + species = 'solanum tuberosum' + datasets = "${projectDir}/tests/test_data/input_datasets/input.csv" + skip_id_mapping = true + skip_fetch_eatlas_accessions = true + gene_id_mapping = "${projectDir}/tests/test_data/input_datasets/mapping.csv" + gene_metadata = "${projectDir}/tests/test_data/input_datasets/metadata.csv" + gene_length = "${projectDir}/tests/test_data/input_datasets/gene_lengths.csv" + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + 
assertAll( + { assert workflow.success}, + { assert snapshot( + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + stable_name, + stable_path + ).match() } + ) + } + } + /* + // TODO: see why this test works locally, even with act, but fails in CI + test("-profile test_included_and_excluded_accessions") { + tag "test_included_and_excluded_accessions" + + when { + params { + species = "solanum tuberosum" + accessions = "E-MTAB-552,E-GEOD-61690" + excluded_accessions = "E-MTAB-4251" + accessions_file = "${projectDir}/tests/test_data/misc/accessions_to_include.txt" + excluded_accessions_file = "${projectDir}/tests/test_data/misc/excluded_accessions.txt" + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_stableexpression_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } + */ + /* + test("-profile test_gprofiler_target_database_entrez") { + + when { + params { + species = 'beta vulgaris' + gprofiler_target_db = 'ENTREZGENE' + outdir = "$outputDir" + } + } + + then { + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + 
{ assert workflow.success}, + { assert snapshot( + stable_name, + stable_path + ).match() } + ) + } + } + */ + } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 00000000..87571ee3 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,1704 @@ +{ + "-profile test_dataset_only": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.39.2", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "CLEAN_GENE_IDS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "COLLECT_ALL_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + "COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_M_MEASURE": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_TPM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "CROSS_JOIN": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DASH_APP": { + "python": "3.14.3", + "dash": "3.3.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.4.0", + "dash-ag-grid": "32.3.2", + "polars": "1.39.2", + "pandas": "2.3.3", + "pyarrow": "23.0.1", + "scipy": "1.17.1" + }, + "DESCRIPTIVE_STATISTICS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "DETECT_RARE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.3", + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "EXPRESSION_RATIO": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "EXTRACT_GENE_IDS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GLOBAL": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GPROFILER_IDMAPPING": { + "httpx": "0.28.1", + "pandas": "3.0.1", 
+ "python": "3.14.3" + }, + "IMPUTE_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "MAKE_CHUNKS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "NORMFINDER": { + "numba": "0.64.0", + "numpy": "2.4.3", + "polars": "1.39.2", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "PLATFORM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "RATIO_STANDARD_VARIATION": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_ZEROS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + 
"aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + 
"dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Mus_musculus.GRCm39.115.chr.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + 
"multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + 
"multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_renaming_warning_reasons.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay", + "normalised/quantile_normalised/SRP254919.salmon.merged.gene_counts.top1000cov.assay/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/SRP254919.salmon.merged.gene_counts.top1000cov.assay", + "normalised/tpm/SRP254919.salmon.merged.gene_counts.top1000cov.assay/SRP254919.salmon.merged.gene_counts.top1000cov.assay.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings", + 
"warnings/renaming_warning_reasons.tsv" + ], + [ + "all_genes_summary.csv:md5,67694aeb7cb1bec8e31a604fa5350783", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,f62b0f3c0462c8a6aeba61ed083ce07e", + "section_1.most_stable_genes_transposed_counts.csv:md5,7c197e7b57cdaee0b0250aed93050e24", + "section_10.most_stable_genes_summary.csv:md5,9b8dde024b554d9b2318bad7e4b76252", + "section_10.most_stable_genes_transposed_counts.csv:md5,6d9df5bef8a5b44340bbdc141d229f68", + "section_11.most_stable_genes_summary.csv:md5,255237f708e36bcd0290be1e811fd8b2", + "section_11.most_stable_genes_transposed_counts.csv:md5,08c786a9859eeae873dacbec33bb2e9e", + "section_12.most_stable_genes_summary.csv:md5,7597d72cce386c406bae974f6bccc089", + "section_12.most_stable_genes_transposed_counts.csv:md5,e93e7f64d0b5890ad95b5e0a9398f255", + "section_13.most_stable_genes_summary.csv:md5,5d0906956d78014f3bef28227e4af8f6", + "section_13.most_stable_genes_transposed_counts.csv:md5,93070ff7b4e36a2486a4b8b45f957ac8", + "section_14.most_stable_genes_summary.csv:md5,5e4743e56fbe09cc030f8401944d219c", + "section_14.most_stable_genes_transposed_counts.csv:md5,dd80d0a9118f20f47b3d06e479423713", + "section_15.most_stable_genes_summary.csv:md5,87a32675404d3908277e725954daf477", + "section_15.most_stable_genes_transposed_counts.csv:md5,0e6ed0bc24c70e3fca43691d87b39eec", + "section_16.most_stable_genes_summary.csv:md5,10b6a0507815b5a0f19371945cee71d9", + "section_16.most_stable_genes_transposed_counts.csv:md5,0dedfedd98859fabb0ba0bb57b08efce", + "section_17.most_stable_genes_summary.csv:md5,6e9ea1f25adaafdeec78ad3419815b68", + "section_17.most_stable_genes_transposed_counts.csv:md5,85a7c2959d89324cc6f250493d4520a7", + "section_18.most_stable_genes_summary.csv:md5,cafcd5fff5842789dc2024c9ec2b45d8", + "section_18.most_stable_genes_transposed_counts.csv:md5,69a36fb3843257e1c67fe06282a22671", + 
"section_19.most_stable_genes_summary.csv:md5,1a2f4da7114df0cdc08c09f024718d0c", + "section_19.most_stable_genes_transposed_counts.csv:md5,4e4b2100fca4c6d461333edf6e5fd9e7", + "section_2.most_stable_genes_summary.csv:md5,74094f7f58405bcfce2972a1073db7ee", + "section_2.most_stable_genes_transposed_counts.csv:md5,67e877f84141462327383d622cab7b49", + "section_20.most_stable_genes_summary.csv:md5,57068c5336541f1cfa8ae699f098b0b6", + "section_20.most_stable_genes_transposed_counts.csv:md5,5691275bafee54ec2520d82011295495", + "section_3.most_stable_genes_summary.csv:md5,537ce26b668dceffd0c23126fb97cef4", + "section_3.most_stable_genes_transposed_counts.csv:md5,907103b9138b1d27b6a17d321ca59bca", + "section_4.most_stable_genes_summary.csv:md5,cb353777ed864969add3036f4aa664ea", + "section_4.most_stable_genes_transposed_counts.csv:md5,2794b4e29fa21c53604cfc189ce6ecdf", + "section_5.most_stable_genes_summary.csv:md5,667468cdafb70c899dd3f7b7cb603ba6", + "section_5.most_stable_genes_transposed_counts.csv:md5,4d4395309a84a820d6c3169906824657", + "section_6.most_stable_genes_summary.csv:md5,62d2ade9627014c122f22bc1a32776a6", + "section_6.most_stable_genes_transposed_counts.csv:md5,73806b0e960342be7378aface0618229", + "section_7.most_stable_genes_summary.csv:md5,56a27a3d372727d3a0dfdef023f9aad2", + "section_7.most_stable_genes_transposed_counts.csv:md5,2067a0574e51f1fe8bf590addbfd6ea9", + "section_8.most_stable_genes_summary.csv:md5,79c41e03a858dd68a5614b1343938c24", + "section_8.most_stable_genes_transposed_counts.csv:md5,35e02009d4aff77f0fca02711a8ad058", + "section_9.most_stable_genes_summary.csv:md5,0e7776950594471fc8dfb914eed0a17a", + "section_9.most_stable_genes_transposed_counts.csv:md5,e102dcbb3bdcec1bd724adbd772655d1", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,67694aeb7cb1bec8e31a604fa5350783", + "whole_design.csv:md5,f29515bc2c783e593fb9028127342593", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + 
"Mus_musculus.GRCm39.115.chr.gff3.gz:md5,66a5d70eeb2ce9685ca871fc7b0f4f96", + "gene_transcript_lengths.csv:md5,09e2d2a8881df9aa96ee71802e9c3451", + "global_gene_id_mapping.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "global_gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "gene_metadata.csv:md5,bd76860b422e45eca7cd583212a977d2", + "mapped_gene_ids.csv:md5,78934d2ac5fe7d863f114c5703f57a06", + "whole_design.csv:md5,f29515bc2c783e593fb9028127342593", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_gene_statistics.txt:md5,9285ae2cfc531a0987e3172be0aa6483", + "multiqc_genes_section_1.txt:md5,3d44381703173383e455e59e84c3ecd9", + "multiqc_genes_section_1_1.txt:md5,49f24363739b3b2952ad813b0a1dc7c9", + "multiqc_genes_section_1_10.txt:md5,23eda043b3773d143f42c79943785baa", + "multiqc_genes_section_1_11.txt:md5,f221ca530b336e01b6c5a7f3a74b4262", + "multiqc_genes_section_1_12.txt:md5,df9f14313b281daae80438b134f25326", + "multiqc_genes_section_1_13.txt:md5,186cbc4df4a2d8bb05ff272725ad573e", + "multiqc_genes_section_1_14.txt:md5,efb4bfa55b0981b5b683a5d3bcf4fee3", + "multiqc_genes_section_1_15.txt:md5,cc47d6728943dafbdf0fa65ef0d075d9", + "multiqc_genes_section_1_16.txt:md5,35b7292de1819b89ece3df46081e6db3", + "multiqc_genes_section_1_17.txt:md5,8468a21c35f53afffe1f3f5a49f56aa7", + "multiqc_genes_section_1_18.txt:md5,8f2906665b62c75ab1786214124c02d1", + "multiqc_genes_section_1_19.txt:md5,d879c38bd6016d13a55adf59b4de7d99", + "multiqc_genes_section_1_2.txt:md5,58126f33166e756917d3fca0c66aafa8", + "multiqc_genes_section_1_3.txt:md5,dded6f5abee4f377eb093d6b95b6daa0", + "multiqc_genes_section_1_4.txt:md5,2c9a3ba7a78140a1e01afcc15b35c835", + "multiqc_genes_section_1_5.txt:md5,cddedab71e149bb731d6dee130dbea65", + "multiqc_genes_section_1_6.txt:md5,8c21ab38a3c761b0fefc029812b1cc35", + "multiqc_genes_section_1_7.txt:md5,d5779f1d0e92d80de7dc728e375d57ee", + "multiqc_genes_section_1_8.txt:md5,66e168eafc2bd8be9e9a397d0fc1c4b9", + 
"multiqc_genes_section_1_9.txt:md5,62a4f655b779f14b751e47f32bf7ccf1", + "multiqc_id_mapping_stats.txt:md5,600e9fa5656a06a3288ea7e6d9fef647", + "multiqc_normalised_expr_distrib_section_1.txt:md5,342306198c5930791d9255b481b6daa8", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,3a1b52103dd52cceeede5b99f0c18d1c", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,69d9ea655368e4b61ea3c49dc336ccc3", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,cf5fcd0fb87409255e88f808993570a8", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,e0767d376933c8849a08ba998acaee39", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,76d7fd5652e923ad09000bc80f9aa4ca", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,ce675ff07c194b1c975879f3288a27e8", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,5936ef07006c81b688289ec764994932", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,1a5a56d8661bcc0f986058a90ebb81b0", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,8c4c83bdcc648b4cbd5876c58d8c30d8", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,832ad87d6f4689a459e443a032f67f9d", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,83be81b32d7c48685015c2ede21fb511", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,f1e22de47e393569f3d193a30bbdc9cd", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,9e190919da3fc866beb78076ad8c4a33", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,3712b42a48fa257ad75f2deba10f631d", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,d927439c8e03a2e6f2ad18f16a90afff", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,b2d87c20485c9ffdd0237c4372d9d6e9", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,d18c331b910ce9702b5a977521c39aa1", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,1a6c7f079559251385f268beee39c9cb", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,688908e507b313d477957cb1d7d6e1a2", + 
"multiqc_null_values_filter.txt:md5,64ca3e3acc613e1b85733fd847712a37", + "multiqc_ratio_nulls.txt:md5,7063b06cadcf854671bc9cefb51a6fe3", + "multiqc_ratio_zeros.txt:md5,7063b06cadcf854671bc9cefb51a6fe3", + "multiqc_renaming_warning_reasons.txt:md5,6e3001e79809e518b23efc517fc5bc67", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,ca154d649786ea5336e7c9e980f00eac", + "multiqc_zero_values_filter.txt:md5,64ca3e3acc613e1b85733fd847712a37", + "id_mapping_stats.csv:md5,b47d6ebd34e3fb11a40665b0a38db3da", + "missing_values_filter_stats.csv:md5,310182ec872cf37ffb81370dfcd01207", + "ratio_nulls.csv:md5,2272ebcf58ac8bb283d238f87d508b96", + "ratio_nulls_per_sample.csv:md5,375371c6d3e58ae69430f0e96b71920d", + "ratio_zeros.csv:md5,2272ebcf58ac8bb283d238f87d508b96", + "zero_values_filter_stats.csv:md5,310182ec872cf37ffb81370dfcd01207", + "renaming_warning_reasons.tsv:md5,0a11a59b5b547a39ab7a0e4dac622173" + ] + ], + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-05T09:10:44.402386214" + }, + "-profile test_skip_id_mapping": { + "content": [ + [ + "errors", + "gene_length", + "gene_length/Solanum_tuberosum.SolTub_3.0.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/gene_ids.txt", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/microarray.normalised", + "normalised/quantile_normalised/microarray.normalised/microarray.normalised.zeros_filtered.nulls_filtered.quant_norm.parquet", + "normalised/quantile_normalised/rnaseq.raw", + 
"normalised/quantile_normalised/rnaseq.raw/rnaseq.raw.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/rnaseq.raw", + "normalised/tpm/rnaseq.raw/rnaseq.raw.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "Solanum_tuberosum.SolTub_3.0.62.gff3.gz:md5,cca99141f43d57d697f6df75de790e05", + "gene_transcript_lengths.csv:md5,217aa7c1e227ce2f78a905138d8e5b39", + "gene_ids.txt:md5,831b47f91a0808802967aa0e53a25de9", + "whole_design.csv:md5,70d6c2673e619ca52d2774fb3e368382", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "missing_values_filter_stats.csv:md5,ebad5386e7c670ff04887eff67c8faae", + "ratio_nulls.csv:md5,ab65c49c9b8ba7e242f391438789e080", + "ratio_nulls_per_sample.csv:md5,5c2931cb8c5ecb27ffa9136628fc714c", + "ratio_zeros.csv:md5,1837a5a03a551fdb0a7bba2869157559", + "zero_values_filter_stats.csv:md5,ebad5386e7c670ff04887eff67c8faae" + ] + ], + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-04T22:10:20.013206958" + }, + "-profile test_dataset_custom_mapping_and_gene_length": { + "content": [ + { + "EXTRACT_GENE_IDS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "errors", + "errors/renaming_failure_reasons.tsv", + "idmapping", + "idmapping/gene_ids.txt", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/renamed", + "merged_datasets", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + 
"multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "statistics", + "statistics/id_mapping_stats.csv", + "warnings" + ], + [ + "renaming_failure_reasons.tsv:md5,d5cae52d86b44b02d7bd00c456576b5d", + "gene_ids.txt:md5,831b47f91a0808802967aa0e53a25de9", + "global_gene_id_mapping.csv:md5,187a86074197044846bb8565e122eb8e", + "global_gene_metadata.csv:md5,5ae2d701ca0cb6384d9e1e08a345e452", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "id_mapping_stats.csv:md5,20bd1443c864cb013c97efc760465e9c" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-21T12:53:02.926804675" + }, + "-profile test": { + "content": [ + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + 
"aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + 
"dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Prunus_persica.Prunus_persica_NCBIv2.62.chr.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_ENAD_3_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + 
"multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/E_ENAD_3_rnaseq", + "normalised/quantile_normalised/E_ENAD_3_rnaseq/E_ENAD_3_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_ENAD_3_rnaseq", + "normalised/tpm/E_ENAD_3_rnaseq/E_ENAD_3_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + 
"public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_ENAD_3_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_ENAD_3_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,50ab37894673a3ff7e7b9cdf70038616", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,574a24ab5764c4084a03a43665efdfeb", + "section_1.most_stable_genes_transposed_counts.csv:md5,f951d9c40133ae0eec2c1592e27c9e01", + "section_10.most_stable_genes_summary.csv:md5,7192f72f608ad9741b213679dc28937c", + "section_10.most_stable_genes_transposed_counts.csv:md5,b8fbfd01d8ba5eba5e4b0f77ae3039e0", + "section_11.most_stable_genes_summary.csv:md5,288d33ee6cb4e5f30a53078c8f0e8a3a", + "section_11.most_stable_genes_transposed_counts.csv:md5,c07f4d994881730e357516b7711085c8", + "section_12.most_stable_genes_summary.csv:md5,63a6e5293e968b73f34fa6fe04ce5a7c", + "section_12.most_stable_genes_transposed_counts.csv:md5,54bf16cc1b84d8c3e6147c347ea19747", + "section_13.most_stable_genes_summary.csv:md5,f5a0871521572851601550dea5b6abb5", + "section_13.most_stable_genes_transposed_counts.csv:md5,5f6857a2334ba3f30f35d3223d9965ac", + "section_14.most_stable_genes_summary.csv:md5,4b5169a450a787855427aef0ec52348e", + "section_14.most_stable_genes_transposed_counts.csv:md5,a33fa8b001d43aed1c4ab77250b3cc7a", + "section_15.most_stable_genes_summary.csv:md5,bf792e95cf332e1c17587b4fbaebdcfb", + 
"section_15.most_stable_genes_transposed_counts.csv:md5,404bd099df6631b9a6c292edc0ab7ed2", + "section_16.most_stable_genes_summary.csv:md5,280f21317a5993d6e8fce796c63040d5", + "section_16.most_stable_genes_transposed_counts.csv:md5,a88ac2ef0471041f8d4ce3471c591192", + "section_17.most_stable_genes_summary.csv:md5,8a971c43f72f9d19124920fc39e68d46", + "section_17.most_stable_genes_transposed_counts.csv:md5,38b0d132084d5aa1f32b02a9448f66c3", + "section_18.most_stable_genes_summary.csv:md5,9481961b11d8386ea013d4fbbcab9ce9", + "section_18.most_stable_genes_transposed_counts.csv:md5,b08d66dbe529a0038a09cd6c435a604c", + "section_19.most_stable_genes_summary.csv:md5,d6ebc49aba6db637c09e973b7e2a1cd4", + "section_19.most_stable_genes_transposed_counts.csv:md5,3e13b4a9e1564e1b23106f7b5b6588ae", + "section_2.most_stable_genes_summary.csv:md5,4fa8e287cb65fcc13a18995973a44d1e", + "section_2.most_stable_genes_transposed_counts.csv:md5,3bce723d7b67560b14352acef626775e", + "section_20.most_stable_genes_summary.csv:md5,41b8421059ca95352617184211753e86", + "section_20.most_stable_genes_transposed_counts.csv:md5,f3715dd47e78d261c38339135703cf12", + "section_3.most_stable_genes_summary.csv:md5,42b235930a27a23e803667fdde064488", + "section_3.most_stable_genes_transposed_counts.csv:md5,df41c6dbece0d4885440ec392ffa261d", + "section_4.most_stable_genes_summary.csv:md5,21360b4a811c29224f2d79a7cdb42059", + "section_4.most_stable_genes_transposed_counts.csv:md5,e49241be359433067b5fc3c747e57030", + "section_5.most_stable_genes_summary.csv:md5,393bb0c32166eb09857e8e4d78c53cea", + "section_5.most_stable_genes_transposed_counts.csv:md5,fd09fa61f02c8721d09cd11dacf7ffe4", + "section_6.most_stable_genes_summary.csv:md5,34b575768b6d23a7154c021be5062a99", + "section_6.most_stable_genes_transposed_counts.csv:md5,c5773c2ff340782d1a3acdfc50a133d3", + "section_7.most_stable_genes_summary.csv:md5,414dd8b188f9dd88acc5b53964a10c6b", + 
"section_7.most_stable_genes_transposed_counts.csv:md5,2b1460cc2c6cd34a1637754fd17d7eec", + "section_8.most_stable_genes_summary.csv:md5,f8835325014d5adc54e3b794edfb4771", + "section_8.most_stable_genes_transposed_counts.csv:md5,7deab925d9006a6f9df005476520f4b7", + "section_9.most_stable_genes_summary.csv:md5,93cdb58ce05f2acc7732f272405a3937", + "section_9.most_stable_genes_transposed_counts.csv:md5,f2e698d07a70f870c06bab83786ab04c", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,50ab37894673a3ff7e7b9cdf70038616", + "whole_design.csv:md5,e6b2a08b65fa02b470829a652593e161", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Prunus_persica.Prunus_persica_NCBIv2.62.chr.gff3.gz:md5,a333b20996b221f18e1c25aebd1c89f1", + "gene_transcript_lengths.csv:md5,e9376e922e381f3b52fd0cfa4ba95605", + "global_gene_id_mapping.csv:md5,dddb3e3dfbd118a6152837df2160b3a8", + "global_gene_metadata.csv:md5,8a93b691384b34e7d2cc30781532f952", + "gene_metadata.csv:md5,8a93b691384b34e7d2cc30781532f952", + "mapped_gene_ids.csv:md5,dddb3e3dfbd118a6152837df2160b3a8", + "whole_design.csv:md5,e6b2a08b65fa02b470829a652593e161", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_eatlas_all_experiments_metadata.txt:md5,e3c379628a87ad8b3f1e5cfe7310fcb0", + "multiqc_eatlas_selected_experiments_metadata.txt:md5,e3c379628a87ad8b3f1e5cfe7310fcb0", + "multiqc_gene_statistics.txt:md5,5acdc3e3c5ee949585746122042e3a95", + "multiqc_genes_section_1.txt:md5,3c48edce816118087fad975c3b2e150e", + "multiqc_genes_section_1_1.txt:md5,60962ac96142f6d818e6e552e49acfbf", + "multiqc_genes_section_1_10.txt:md5,8b5656d0384954be77d68b52b352c274", + "multiqc_genes_section_1_11.txt:md5,291d6d3087e97955aed29c49c8212ead", + "multiqc_genes_section_1_12.txt:md5,91698e69c64018fd72957c490f7ff3ff", + "multiqc_genes_section_1_13.txt:md5,9871fd84ac2bad8f740ce5aa42097db4", + "multiqc_genes_section_1_14.txt:md5,4ecd956786bd45a8412c72421b0ba071", + 
"multiqc_genes_section_1_15.txt:md5,7c84424999bcbb32f93898dcbdcfc2c3", + "multiqc_genes_section_1_16.txt:md5,bfc047dce5c21deca584138e89c6c993", + "multiqc_genes_section_1_17.txt:md5,167a36544d5cd92e9a178e92b91b2f42", + "multiqc_genes_section_1_18.txt:md5,f41528e9d8a43b4184ab64d2f17bb52a", + "multiqc_genes_section_1_19.txt:md5,2b8ae5d9a6f2562a29e153325c0813ae", + "multiqc_genes_section_1_2.txt:md5,6d5d700f547ed93498b75ca9d7e3a8b6", + "multiqc_genes_section_1_3.txt:md5,9d9d88b1065e6b7337b14683d08e6247", + "multiqc_genes_section_1_4.txt:md5,0aa0e21163e065cef0d5cdc437f23fbd", + "multiqc_genes_section_1_5.txt:md5,06b5f26e442498b658106df4239a2bf9", + "multiqc_genes_section_1_6.txt:md5,04be4ff16ce543c2bf6b40f4686fb46b", + "multiqc_genes_section_1_7.txt:md5,6a88f35e493baf1409ea9590f4f33e8e", + "multiqc_genes_section_1_8.txt:md5,fc195cb44c051684ac8a04bc42f1d563", + "multiqc_genes_section_1_9.txt:md5,fbfe2930503e42809f25fa1d8022d554", + "multiqc_id_mapping_stats.txt:md5,eebd614817761551497a4210022da830", + "multiqc_normalised_expr_distrib_section_1.txt:md5,643038ea4a668569e988bd0aac4a53e0", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,e78c321fe74010ffc43e1a24699c63c5", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,c6362429c6f181bb42c874a7a01f9704", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,adda3b5049d3444aace2ce78381d7df0", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,71d7deda66c9084a5f28e27061d66f58", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,0517c3cd42512d6d6d9da151cafd5ef9", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,6ddb07bfcb9c80045847b9f392cc49e5", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,17dce6dc28fab7606a9664249575a9a8", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,289b63cd2b4ed35780597b3db595a506", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,02bdf8ead4c248f5fcb0f80837d01140", + 
"multiqc_normalised_expr_distrib_section_1_18.txt:md5,2f8835d4cf0ecc6cdff09368f511aceb", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,d3ca94b2b22de241043d0dc8427cd916", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,c67b04d423c08f50abb7aded61e3fee8", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,bf5750a74ede44bebb3820bab622a932", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,6223117ebef835bd7f17e5a39015f97e", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,ab5ef1ab638a16b554baa16711f2fee4", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,42231476381c9522d4abef2219b3a376", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,c8cf57807b257529733df840084fab80", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,2291b32d9fc682a4bd8f7e74f181304f", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,88f2fd190de04ea355afdedd03184068", + "multiqc_null_values_filter.txt:md5,9f39dde761d2be72b989b3da51d9b768", + "multiqc_ratio_nulls.txt:md5,fd9acca0d6995183d30b6ef2489a596e", + "multiqc_ratio_zeros.txt:md5,7de1385d524ab670ffa9ddaf4cb8735b", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,4f5408fbf6cedc9160035dc0322781dd", + "multiqc_zero_values_filter.txt:md5,9f39dde761d2be72b989b3da51d9b768", + "accessions.txt:md5,c1a80aeb676be48beefb1df23a80da5b", + "selected_experiments.metadata.tsv:md5,f0f38d66449a88b9b95347d2d2ab4c68", + "species_experiments.metadata.tsv:md5,f0f38d66449a88b9b95347d2d2ab4c68", + "E_ENAD_3_rnaseq.design.csv:md5,376b9275a5b04892372bdecd1e51738d", + "E_ENAD_3_rnaseq.rnaseq.raw.counts.csv:md5,856b851cb139c3ddef5883efe2d85cfe", + "id_mapping_stats.csv:md5,574b7a35bd7fb4bb0122e5c7742d2b64", + "missing_values_filter_stats.csv:md5,a6fdf3c5250dc46a43f79a9ec5a1355d", + "ratio_nulls.csv:md5,f9d04a18f447dc5df3076cd3cebec755", + "ratio_nulls_per_sample.csv:md5,9606b597f36ab94930be607b2207c88b", + "ratio_zeros.csv:md5,72926e3905e83dd22a5a2dc62aa98d1d", + 
"zero_values_filter_stats.csv:md5,a6fdf3c5250dc46a43f79a9ec5a1355d" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T11:45:04.163764784" + }, + "-profile test_accessions_only": { + "content": [ + { + "EXPRESSION_ATLAS": { + "httpx": "0.28.1", + "nltk": "3.9.2", + "pandas": "3.0.1", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "errors", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "statistics", + "warnings" + ], + [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "accessions.txt:md5,b63b86661ce5d73c1c95436ebaf146a1", + "selected_experiments.metadata.tsv:md5,2af519969d9c77cc74d92723ee171c48", + "species_experiments.metadata.tsv:md5,2af519969d9c77cc74d92723ee171c48" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T11:58:55.484363676" + }, + "-profile test_one_accession_low_gene_count": { + "content": [ + { + "AGGREGATE_RESULTS": { + "polars": "1.39.2", + "python": "3.14.3", + "pyyaml": "6.0.3" + }, + "CLEAN_GENE_IDS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "COLLECT_ALL_GENE_IDS": { + "python": "3.14.2", + "tqdm": "4.67.1" + }, + 
"COLLECT_STATISTICS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_GENE_TRANSCRIPT_LENGTHS": { + "pandas": "2.3.3", + "python": "3.13.7" + }, + "COMPUTE_M_MEASURE": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_STABILITY_SCORES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "COMPUTE_TPM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "CROSS_JOIN": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DASH_APP": { + "python": "3.14.3", + "dash": "3.3.0", + "dash-extensions": "2.0.4", + "dash-mantine-components": "2.4.0", + "dash-ag-grid": "32.3.2", + "polars": "1.39.2", + "pandas": "2.3.3", + "pyarrow": "23.0.1", + "scipy": "1.17.1" + }, + "DESCRIPTIVE_STATISTICS": { + "polars": "1.37.1", + "python": "3.12.8" + }, + "DETECT_RARE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "DOWNLOAD_ENSEMBL_ANNOTATION": { + "bs4": "4.14.3", + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.34.0", + "R": "4.4.3 (2025-02-28)" + }, + "EXPRESSION_RATIO": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "EXTRACT_GENE_IDS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "FILTER_AND_RENAME_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GET_CANDIDATE_GENES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GLOBAL": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "GPROFILER_IDMAPPING": { + "httpx": "0.28.1", + "pandas": "3.0.1", + "python": "3.14.3" + }, + "IMPUTE_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + "MAKE_CHUNKS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "NORMFINDER": { + "numba": "0.64.0", + "numpy": "2.4.3", + "polars": "1.39.2", + "python": "3.14.3", + "tqdm": "4.67.3" + }, + "PLATFORM": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "QUANTILE_NORMALISATION": { + "polars": "1.39.2", + "python": "3.14.3", + "scikit-learn": "1.8.0" + }, + 
"RATIO_STANDARD_VARIATION": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_MISSING_VALUES": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "TOO_MANY_ZEROS": { + "polars": "1.39.2", + "python": "3.14.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", + "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + 
"aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + "dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Arabidopsis_thaliana.TAIR10.62.gff3.gz", + 
"gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + 
"multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + "multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + 
"multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/E_GEOD_51720_rnaseq", + "normalised/quantile_normalised/E_GEOD_51720_rnaseq/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_GEOD_51720_rnaseq", + "normalised/tpm/E_GEOD_51720_rnaseq/E_GEOD_51720_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_GEOD_51720_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_GEOD_51720_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,643bb1aa5f128bad6f192bd2aeaa2ee6", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,911c1c687cdc308f5aecaef42d504a89", + "section_1.most_stable_genes_transposed_counts.csv:md5,849135e7f42258dd2975d74f136d23aa", + "section_10.most_stable_genes_summary.csv:md5,131be639b26c51537ec05d67258a2820", + 
"section_10.most_stable_genes_transposed_counts.csv:md5,aa8748ba5cab4cb2387a616326d82023", + "section_11.most_stable_genes_summary.csv:md5,6e363a28b0762735cdf575f6aec3fb54", + "section_11.most_stable_genes_transposed_counts.csv:md5,705e2b4becb685f21490948f648cee0a", + "section_12.most_stable_genes_summary.csv:md5,6e49cbc5af4a45fcd62f9a9c9d1c82ad", + "section_12.most_stable_genes_transposed_counts.csv:md5,a757cf115a30079e4dea9ebe44e587d5", + "section_13.most_stable_genes_summary.csv:md5,6765ae522f95e29af34c118c36464510", + "section_13.most_stable_genes_transposed_counts.csv:md5,578d598340aa36cf38852e06e619190a", + "section_14.most_stable_genes_summary.csv:md5,dba0d2e1803d588bbc213896ea143d56", + "section_14.most_stable_genes_transposed_counts.csv:md5,53f8590e2ddfbbc80e1e72516f5b821a", + "section_15.most_stable_genes_summary.csv:md5,1ff8d851ef7bceecb1bb96111cf42ed9", + "section_15.most_stable_genes_transposed_counts.csv:md5,2f1987c6e0327610cfaf3b5ac4b17c99", + "section_16.most_stable_genes_summary.csv:md5,22a67ebb023441a8428b8d9277c237f7", + "section_16.most_stable_genes_transposed_counts.csv:md5,467d69d7581c7b2d008b6a69004775f2", + "section_17.most_stable_genes_summary.csv:md5,e5cbc51cfe86c7b2225804410d30665b", + "section_17.most_stable_genes_transposed_counts.csv:md5,d758e3e9e4274ff7815af4fa9f84154d", + "section_18.most_stable_genes_summary.csv:md5,828dd90d5c39cf1b714e2804dd7b8d84", + "section_18.most_stable_genes_transposed_counts.csv:md5,2f633511784b3babc159c4ecfed76fa2", + "section_19.most_stable_genes_summary.csv:md5,b32ed5d4a50671ac38a4a616dc81b2b9", + "section_19.most_stable_genes_transposed_counts.csv:md5,b507a8bbe8e2d3852e7952e932917751", + "section_2.most_stable_genes_summary.csv:md5,439d0e60a30d7232508e695a210053c5", + "section_2.most_stable_genes_transposed_counts.csv:md5,a1803a9577616d7a098ad1567817cb20", + "section_20.most_stable_genes_summary.csv:md5,0d82b5d34b415947bdda4d016fa52f71", + 
"section_20.most_stable_genes_transposed_counts.csv:md5,3a1ae07c51acb0a1672e210a8a137121", + "section_3.most_stable_genes_summary.csv:md5,1ade5c406fe691b48a7f6b56b4778971", + "section_3.most_stable_genes_transposed_counts.csv:md5,71d9e444731c709189ed569ada9be4c1", + "section_4.most_stable_genes_summary.csv:md5,aa1216a538b2723ac246fd336b8a3fcb", + "section_4.most_stable_genes_transposed_counts.csv:md5,8bd766e3232e4f7591cba721cbf305dc", + "section_5.most_stable_genes_summary.csv:md5,84e099dbe057240baa5542e035214362", + "section_5.most_stable_genes_transposed_counts.csv:md5,180382fc6c81bc94032fb592425d1596", + "section_6.most_stable_genes_summary.csv:md5,e455df268552dbede82debdaff7f2bb5", + "section_6.most_stable_genes_transposed_counts.csv:md5,da8bc59c611f88c51b047f6ccb50d08b", + "section_7.most_stable_genes_summary.csv:md5,ef6db8ade4ffd92d0ef872b8e4c88417", + "section_7.most_stable_genes_transposed_counts.csv:md5,b1d1db3949dd5a07ea45baf10c184d05", + "section_8.most_stable_genes_summary.csv:md5,911c809c86111dc0597a953cbfa26d62", + "section_8.most_stable_genes_transposed_counts.csv:md5,337a0e231598d45291a6a42a25c585b1", + "section_9.most_stable_genes_summary.csv:md5,cfaafcd65fffaed8169835cfc0992430", + "section_9.most_stable_genes_transposed_counts.csv:md5,cdb7220619e76d11963f1f1b08101e42", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,643bb1aa5f128bad6f192bd2aeaa2ee6", + "whole_design.csv:md5,d3aa542c4ad07d0051a84482fe6cd81c", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Arabidopsis_thaliana.TAIR10.62.gff3.gz:md5,b02566c301d47461db70747b3adaa6ce", + "gene_transcript_lengths.csv:md5,06b4612031f4f300a6d67f36e7625492", + "global_gene_id_mapping.csv:md5,42491ef436cce231258c0358e1af5745", + "global_gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + "gene_metadata.csv:md5,b35e20500269d4e6787ef1a3468f16bc", + "mapped_gene_ids.csv:md5,42491ef436cce231258c0358e1af5745", + 
"whole_design.csv:md5,d3aa542c4ad07d0051a84482fe6cd81c", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_gene_statistics.txt:md5,53fe105326f1a097d3437731eb4e3a8d", + "multiqc_genes_section_1.txt:md5,cb79085a6608e6dfd5a96291dcea850b", + "multiqc_genes_section_1_1.txt:md5,7308dd2805d9b530457a1eb839e1b455", + "multiqc_genes_section_1_10.txt:md5,91e61a6a01bdf35096fb02f79476dd58", + "multiqc_genes_section_1_11.txt:md5,f0b8df84a99b2d5ef557ee8896217095", + "multiqc_genes_section_1_12.txt:md5,89e8c3dcd3d970735de56ed6dd618caf", + "multiqc_genes_section_1_13.txt:md5,b7b1b4265c236ba1c8ed7358e34a6dd6", + "multiqc_genes_section_1_14.txt:md5,831514e662296f82a3f0370ae64b1503", + "multiqc_genes_section_1_15.txt:md5,92aa9a142514894d965ce5f41bee781d", + "multiqc_genes_section_1_16.txt:md5,e905ff948cccf03b24177517e39078ad", + "multiqc_genes_section_1_17.txt:md5,920067a6137cbded388b393f4a84d0bf", + "multiqc_genes_section_1_18.txt:md5,e46e3add55d144e8dc04087498b73b65", + "multiqc_genes_section_1_19.txt:md5,72e10039958b0d2667136688b35411cf", + "multiqc_genes_section_1_2.txt:md5,210eff8a16470b70dd186c52aa218512", + "multiqc_genes_section_1_3.txt:md5,15f3d0a57e714b176361689eece78b90", + "multiqc_genes_section_1_4.txt:md5,36cb183f89030a540dc51f83fe0073c4", + "multiqc_genes_section_1_5.txt:md5,6bd50c3d3040facf83fb70d3aad70caf", + "multiqc_genes_section_1_6.txt:md5,420fee370865219de09913c9eb827a49", + "multiqc_genes_section_1_7.txt:md5,fb4c14faf2e007704f1fcb21949deb2d", + "multiqc_genes_section_1_8.txt:md5,df2f893d352fc6992f8d95e18f30a1e4", + "multiqc_genes_section_1_9.txt:md5,75c27fc9730c4346074c667cc8d1c885", + "multiqc_id_mapping_stats.txt:md5,49023d9842e01da40e2c50e9659802d5", + "multiqc_normalised_expr_distrib_section_1.txt:md5,9e50c1075664481653bb278323672633", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,a1fa5d657a142abbf49fb95bf266d906", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,f89a15a3af0047f9bd0f5d01ca9ccb33", + 
"multiqc_normalised_expr_distrib_section_1_11.txt:md5,fead0770f22c316593d6d2353d94e9f7", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,8cacaee9d1bedf3ec8a4d66f3bab1f7f", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,2567a9943c1c49e575b4c2fe6a3a3185", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,02d40fd44721ec46f59736221500078a", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,0c89badaf4e435df8526ae8e9f4802ab", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,3fe2b8ffacda4c1f8ca761eb7a1e1086", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,c86c0cf8c3e4eab7a61979f622f126d7", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,17554bf8a45621ecdedefe2a9b79835e", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,28b90411fa811ba678f237e9ee6f20a2", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,b3876970c55302cb37f1bd8f8ca620ee", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,2ead25fe7da0f48beca784882fabb1a6", + "multiqc_normalised_expr_distrib_section_1_4.txt:md5,68245ac492b42288c310612a5e88cbe4", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,ac5f414686facdfc71016982d3824875", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,88d20ad256f42e564daf79ca8c13a1a2", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,4cb4700660dd2613194c7b62324d019b", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,43eb422269b358c59e2d31f9602b24b3", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,d444233cf608c17cfdc7cc8ebf2c2fe9", + "multiqc_null_values_filter.txt:md5,91eb32460cdebb4e08ae0b1ee559cf59", + "multiqc_ratio_nulls.txt:md5,bcf9aa423c404f2e7f8ea84735810959", + "multiqc_ratio_zeros.txt:md5,c743a773da2858b59923eff1873c26d0", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,9eb24790b7fbfee4b7c3bcff74a334db", + "multiqc_zero_values_filter.txt:md5,a9ec449705f94f15962e6ca856b87420", + "E_GEOD_51720_rnaseq.design.csv:md5,80805afb29837b6fbb73a6aa6f3a461b", + 
"E_GEOD_51720_rnaseq.rnaseq.raw.counts.csv:md5,07cd448196fc2fea4663bd9705da2b98", + "id_mapping_stats.csv:md5,cd17a5d4afa6b86a48adb03868d3073f", + "missing_values_filter_stats.csv:md5,cd1ab16f9c485f8e739a54344cde1aed", + "ratio_nulls.csv:md5,9c496b3b8c098a1bc17c6be7a87f2331", + "ratio_nulls_per_sample.csv:md5,9211cb6081071e8825119194faf6241f", + "ratio_zeros.csv:md5,17b7bde6ca29e11bb1e28db6b8053add", + "zero_values_filter_stats.csv:md5,766d888e41179e8a785f634b3b606bc9" + ] + ], + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-05T09:30:42.794565916" + }, + "-profile test_public_and_dataset": { + "content": [ + [ + "aggregated", + "aggregated/all_genes_summary.csv", + "aggregated/custom_content_multiqc_config.yaml", + "aggregated/section_1.most_stable_genes_summary.csv", + "aggregated/section_1.most_stable_genes_transposed_counts.csv", + "aggregated/section_10.most_stable_genes_summary.csv", + "aggregated/section_10.most_stable_genes_transposed_counts.csv", + "aggregated/section_11.most_stable_genes_summary.csv", + "aggregated/section_11.most_stable_genes_transposed_counts.csv", + "aggregated/section_12.most_stable_genes_summary.csv", + "aggregated/section_12.most_stable_genes_transposed_counts.csv", + "aggregated/section_13.most_stable_genes_summary.csv", + "aggregated/section_13.most_stable_genes_transposed_counts.csv", + "aggregated/section_14.most_stable_genes_summary.csv", + "aggregated/section_14.most_stable_genes_transposed_counts.csv", + "aggregated/section_15.most_stable_genes_summary.csv", + "aggregated/section_15.most_stable_genes_transposed_counts.csv", + "aggregated/section_16.most_stable_genes_summary.csv", + "aggregated/section_16.most_stable_genes_transposed_counts.csv", + "aggregated/section_17.most_stable_genes_summary.csv", + "aggregated/section_17.most_stable_genes_transposed_counts.csv", + "aggregated/section_18.most_stable_genes_summary.csv", + "aggregated/section_18.most_stable_genes_transposed_counts.csv", 
+ "aggregated/section_19.most_stable_genes_summary.csv", + "aggregated/section_19.most_stable_genes_transposed_counts.csv", + "aggregated/section_2.most_stable_genes_summary.csv", + "aggregated/section_2.most_stable_genes_transposed_counts.csv", + "aggregated/section_20.most_stable_genes_summary.csv", + "aggregated/section_20.most_stable_genes_transposed_counts.csv", + "aggregated/section_3.most_stable_genes_summary.csv", + "aggregated/section_3.most_stable_genes_transposed_counts.csv", + "aggregated/section_4.most_stable_genes_summary.csv", + "aggregated/section_4.most_stable_genes_transposed_counts.csv", + "aggregated/section_5.most_stable_genes_summary.csv", + "aggregated/section_5.most_stable_genes_transposed_counts.csv", + "aggregated/section_6.most_stable_genes_summary.csv", + "aggregated/section_6.most_stable_genes_transposed_counts.csv", + "aggregated/section_7.most_stable_genes_summary.csv", + "aggregated/section_7.most_stable_genes_transposed_counts.csv", + "aggregated/section_8.most_stable_genes_summary.csv", + "aggregated/section_8.most_stable_genes_transposed_counts.csv", + "aggregated/section_9.most_stable_genes_summary.csv", + "aggregated/section_9.most_stable_genes_transposed_counts.csv", + "dash_app", + "dash_app/app.py", + "dash_app/assets", + "dash_app/assets/style.css", + "dash_app/data", + "dash_app/data/all_counts.imputed.parquet", + "dash_app/data/all_genes_summary.csv", + "dash_app/data/whole_design.csv", + "dash_app/environment.yml", + "dash_app/src", + "dash_app/src/callbacks", + "dash_app/src/callbacks/common.py", + "dash_app/src/callbacks/genes.py", + "dash_app/src/callbacks/samples.py", + "dash_app/src/components", + "dash_app/src/components/graphs.py", + "dash_app/src/components/icons.py", + "dash_app/src/components/right_sidebar.py", + "dash_app/src/components/settings", + "dash_app/src/components/settings/genes.py", + "dash_app/src/components/settings/samples.py", + "dash_app/src/components/stores.py", + 
"dash_app/src/components/tables.py", + "dash_app/src/components/tooltips.py", + "dash_app/src/components/top.py", + "dash_app/src/utils", + "dash_app/src/utils/config.py", + "dash_app/src/utils/data_management.py", + "dash_app/src/utils/style.py", + "errors", + "gene_length", + "gene_length/Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz", + "gene_length/gene_transcript_lengths.csv", + "idmapping", + "idmapping/global_gene_id_mapping.csv", + "idmapping/global_gene_metadata.csv", + "idmapping/gprofiler", + "idmapping/gprofiler/gene_metadata.csv", + "idmapping/gprofiler/mapped_gene_ids.csv", + "idmapping/renamed", + "idmapping/renamed/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.parquet", + "idmapping/renamed/beta_vulgaris.rnaseq.raw.counts.cleaned.renamed.parquet", + "merged_datasets", + "merged_datasets/whole_design.csv", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_eatlas_all_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_eatlas_selected_experiments_metadata.txt", + "multiqc/multiqc_data/multiqc_gene_statistics.txt", + "multiqc/multiqc_data/multiqc_genes_section_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_1.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_10.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_11.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_12.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_13.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_14.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_15.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_16.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_17.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_18.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_19.txt", + 
"multiqc/multiqc_data/multiqc_genes_section_1_2.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_3.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_4.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_5.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_6.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_7.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_8.txt", + "multiqc/multiqc_data/multiqc_genes_section_1_9.txt", + "multiqc/multiqc_data/multiqc_id_mapping_stats.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_1.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_10.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_11.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_12.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_13.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_14.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_15.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_16.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_17.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_18.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_19.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_2.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_3.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_4.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_5.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_6.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_7.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_8.txt", + "multiqc/multiqc_data/multiqc_normalised_expr_distrib_section_1_9.txt", + 
"multiqc/multiqc_data/multiqc_null_values_filter.txt", + "multiqc/multiqc_data/multiqc_ratio_nulls.txt", + "multiqc/multiqc_data/multiqc_ratio_zeros.txt", + "multiqc/multiqc_data/multiqc_skewness.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_data/multiqc_total_gene_id_occurrence_quantiles.txt", + "multiqc/multiqc_data/multiqc_zero_values_filter.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/genes_section_1.pdf", + "multiqc/multiqc_plots/pdf/normalised_expr_distrib_section_1.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/genes_section_1.png", + "multiqc/multiqc_plots/png/normalised_expr_distrib_section_1.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/genes_section_1.svg", + "multiqc/multiqc_plots/svg/normalised_expr_distrib_section_1.svg", + "multiqc/multiqc_report.html", + "normalised", + "normalised/quantile_normalised", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq", + "normalised/quantile_normalised/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.quant_norm.parquet", + "normalised/tpm", + "normalised/tpm/E_MTAB_8187_rnaseq", + "normalised/tpm/E_MTAB_8187_rnaseq/E_MTAB_8187_rnaseq.rnaseq.raw.counts.cleaned.renamed.zeros_filtered.nulls_filtered.tpm.parquet", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.design.csv", + 
"public_data/expression_atlas/datasets/E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "statistics/id_mapping_stats.csv", + "statistics/missing_values_filter_stats.csv", + "statistics/ratio_nulls.csv", + "statistics/ratio_nulls_per_sample.csv", + "statistics/ratio_zeros.csv", + "statistics/skewness.csv", + "statistics/zero_values_filter_stats.csv", + "warnings" + ], + [ + "all_genes_summary.csv:md5,e3f8d59accf267c351d0a995ffc9ebf5", + "custom_content_multiqc_config.yaml:md5,e048085491cb74658cf363545b1278fe", + "section_1.most_stable_genes_summary.csv:md5,be640cd7efc6a7ac3df989b9ab9a6448", + "section_1.most_stable_genes_transposed_counts.csv:md5,8363bc69b84c68fe4ecea13b6dc70d98", + "section_10.most_stable_genes_summary.csv:md5,41c3ba1e338277e40e03c9b043059cb0", + "section_10.most_stable_genes_transposed_counts.csv:md5,4a599908cea31077650911161a4fd155", + "section_11.most_stable_genes_summary.csv:md5,136e636de09496412dc76ef7fb10c47b", + "section_11.most_stable_genes_transposed_counts.csv:md5,9aeb482d2ff0cbfaa8d29a5af4357701", + "section_12.most_stable_genes_summary.csv:md5,c27fb0df29ac4fb3bea8df3fbb6ef2b1", + "section_12.most_stable_genes_transposed_counts.csv:md5,edbe661b7c150c1a8af01c3c52ea45f7", + "section_13.most_stable_genes_summary.csv:md5,0395eed958d9571fae34ae29b8fe643e", + "section_13.most_stable_genes_transposed_counts.csv:md5,3ece34d50b412abddbce5da5c05f10de", + "section_14.most_stable_genes_summary.csv:md5,8677aa89331f67690330becf078260e3", + "section_14.most_stable_genes_transposed_counts.csv:md5,15840cc29d8d27881b59f19804134f97", + "section_15.most_stable_genes_summary.csv:md5,182e3a6e3a855340c50b5d2705b84142", + "section_15.most_stable_genes_transposed_counts.csv:md5,8a4c0d3018f3ed87305b4cafa8d3a7ae", + "section_16.most_stable_genes_summary.csv:md5,6c41bed8aea0f1cfa973ae7dfc93a148", + "section_16.most_stable_genes_transposed_counts.csv:md5,e3196137992a40340e20cb46ebd5cbdd", + 
"section_17.most_stable_genes_summary.csv:md5,f4aaec1b2af2e89bf26c156b907097e8", + "section_17.most_stable_genes_transposed_counts.csv:md5,af06eab6bc04fc315544fcd0176da4cd", + "section_18.most_stable_genes_summary.csv:md5,5f21148626ed40d0d64b393babcf160d", + "section_18.most_stable_genes_transposed_counts.csv:md5,29fc2248ad428cb3ac8898b0a5471eec", + "section_19.most_stable_genes_summary.csv:md5,5acc2a1b1980004f88c0584a8cf0784e", + "section_19.most_stable_genes_transposed_counts.csv:md5,9586c452f93c486ed667fb343af3b13c", + "section_2.most_stable_genes_summary.csv:md5,95e986dad2f0232070aa47079b6465c1", + "section_2.most_stable_genes_transposed_counts.csv:md5,b22984d5b00ee4540fca59b5585a0a88", + "section_20.most_stable_genes_summary.csv:md5,9d9c5cd95d1d1a350a8d1f2ce363f882", + "section_20.most_stable_genes_transposed_counts.csv:md5,e9f4187bdc7079c3130bdff1e4ebf575", + "section_3.most_stable_genes_summary.csv:md5,7825d8dbcfd1c4e5a4e4ca42268d4ea8", + "section_3.most_stable_genes_transposed_counts.csv:md5,77d118556692fe285590489db96f47d0", + "section_4.most_stable_genes_summary.csv:md5,221b0d42881ada7cd7fcca65cdc827a4", + "section_4.most_stable_genes_transposed_counts.csv:md5,5d1d9ebe8151765fb37176c86f3c7812", + "section_5.most_stable_genes_summary.csv:md5,a3c3edb5fd3cf852185531a4adcd9fd9", + "section_5.most_stable_genes_transposed_counts.csv:md5,51289b18ac41641114892519d2e494a6", + "section_6.most_stable_genes_summary.csv:md5,5a7baf9eadb389cc234808d56ee6fdfe", + "section_6.most_stable_genes_transposed_counts.csv:md5,bb1cddda97df3915d2aad5973e1c8a16", + "section_7.most_stable_genes_summary.csv:md5,a1ed63a57844d1bce998eea23714f071", + "section_7.most_stable_genes_transposed_counts.csv:md5,82c59e866871569fbde316efea5e7ea3", + "section_8.most_stable_genes_summary.csv:md5,40673407e734107f0cebf2045023155a", + "section_8.most_stable_genes_transposed_counts.csv:md5,0bfb8031fc91115a61a57113a6df5c4d", + 
"section_9.most_stable_genes_summary.csv:md5,7178bb75b1733f71d0aeba2a09750b3b", + "section_9.most_stable_genes_transposed_counts.csv:md5,b02e0d31ed2c0fa925060893062c07a7", + "style.css:md5,e6ba182eaf06980dbda49920efbf6e64", + "all_genes_summary.csv:md5,e3f8d59accf267c351d0a995ffc9ebf5", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "environment.yml:md5,dd081780e1f98d34b13289d019f8bb5b", + "Beta_vulgaris.RefBeet-1.2.2.62.gff3.gz:md5,6f2c45809441c8776e6578000db2b0e4", + "gene_transcript_lengths.csv:md5,458c7dfd3598bdcbcb6ceb76ccba189f", + "global_gene_id_mapping.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "global_gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "gene_metadata.csv:md5,cc8d4afdbaf03cd39a4e10f2a9040b7e", + "mapped_gene_ids.csv:md5,7eecbd2d88adaf5f213f238a72d28b99", + "whole_design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_eatlas_all_experiments_metadata.txt:md5,6ea661253a55d41687e32fea72961063", + "multiqc_eatlas_selected_experiments_metadata.txt:md5,8b7643e0ef8eaaa3fa72f7103fd7ccee", + "multiqc_gene_statistics.txt:md5,d7750cb95663a63219dcec94e03d7af1", + "multiqc_genes_section_1.txt:md5,f310a16068d5e76713497e2d3824cf2d", + "multiqc_genes_section_1_1.txt:md5,d68c3cce20e06aaf226e88e0e52184b3", + "multiqc_genes_section_1_10.txt:md5,304bd44c0867a1419e7b48e5bb6dff05", + "multiqc_genes_section_1_11.txt:md5,807ad09f10e257546f18e5fb052511e9", + "multiqc_genes_section_1_12.txt:md5,e3d5acc5a292639bc3a1b1b5e7f5a04b", + "multiqc_genes_section_1_13.txt:md5,7dd72d333b12fc101f4a5b555e09d49a", + "multiqc_genes_section_1_14.txt:md5,59d0addf52e85cdf7d0163721c29c095", + "multiqc_genes_section_1_15.txt:md5,b10474b0ad8cd3cdf21dbe8dc4fd3676", + "multiqc_genes_section_1_16.txt:md5,6f038b7c99db654f2d749da25f7c213b", + "multiqc_genes_section_1_17.txt:md5,9f9f97f85d6605978b286942ac69ba2c", + "multiqc_genes_section_1_18.txt:md5,ab6c6e6e1a658ba92baa6dd2b68f56bf", + 
"multiqc_genes_section_1_19.txt:md5,5d4910983359e122e07fdbe2aeda10f7", + "multiqc_genes_section_1_2.txt:md5,89b5e91c54815bd340411210fb7b86a7", + "multiqc_genes_section_1_3.txt:md5,94130719e096ffd035a155aa59b4bdd0", + "multiqc_genes_section_1_4.txt:md5,ba0275140b46c0c2d2690304bfd008d8", + "multiqc_genes_section_1_5.txt:md5,fcdcb0618858bf79586f679f4834f902", + "multiqc_genes_section_1_6.txt:md5,9cf7cebccab8b0073cad3d43d4d2ef92", + "multiqc_genes_section_1_7.txt:md5,d440bc9cce034ba82dd0d9f3387f9094", + "multiqc_genes_section_1_8.txt:md5,dc1f5de798343036301a059b545a378f", + "multiqc_genes_section_1_9.txt:md5,e9402e81e8c32c8a6b4015c4a55962f0", + "multiqc_id_mapping_stats.txt:md5,d7c6d500c8ea91c32da4980b5557d15e", + "multiqc_normalised_expr_distrib_section_1.txt:md5,fe7c9f8eff636a38deee18a05e17ed4d", + "multiqc_normalised_expr_distrib_section_1_1.txt:md5,7578a930f8750ecb56e892a54211e28f", + "multiqc_normalised_expr_distrib_section_1_10.txt:md5,696c5b24d54057e4738bbd0b351c5d28", + "multiqc_normalised_expr_distrib_section_1_11.txt:md5,94ef2626cd23a3395ba0f53be43b529e", + "multiqc_normalised_expr_distrib_section_1_12.txt:md5,cf62d3846d7d00b438719e75551bd3fa", + "multiqc_normalised_expr_distrib_section_1_13.txt:md5,825766b14187d801ae2284dffd562ac4", + "multiqc_normalised_expr_distrib_section_1_14.txt:md5,b18a4df24ed61f0315d41d4cddfd6539", + "multiqc_normalised_expr_distrib_section_1_15.txt:md5,4d99b3d87c9a25b18fa5ed2061dfb71c", + "multiqc_normalised_expr_distrib_section_1_16.txt:md5,82305a3ca8a54e44a558d0c83dfca9f3", + "multiqc_normalised_expr_distrib_section_1_17.txt:md5,adf99bc87dd29499a1bfc50c3c26488c", + "multiqc_normalised_expr_distrib_section_1_18.txt:md5,8b64cbab2e0cca85575b18b41f973aa5", + "multiqc_normalised_expr_distrib_section_1_19.txt:md5,4544499f66cd9de554f2d26944028cd5", + "multiqc_normalised_expr_distrib_section_1_2.txt:md5,d74f1b40545293b2dba02a0ff167119d", + "multiqc_normalised_expr_distrib_section_1_3.txt:md5,e5701cd16921b4ce657ac131418e04d1", + 
"multiqc_normalised_expr_distrib_section_1_4.txt:md5,fd093b2d0d535ff16ba846bde129f690", + "multiqc_normalised_expr_distrib_section_1_5.txt:md5,4dbddb8d44680d3cc45a3053c510ca2d", + "multiqc_normalised_expr_distrib_section_1_6.txt:md5,497c20bb2f2d2c03595c897f30775411", + "multiqc_normalised_expr_distrib_section_1_7.txt:md5,5c3fb8ff5e1b90d0a9904712204fc36d", + "multiqc_normalised_expr_distrib_section_1_8.txt:md5,9e5d9c6fb87d348a893bfed6b24f01ce", + "multiqc_normalised_expr_distrib_section_1_9.txt:md5,6a40889210cec540d4b3a2e903454003", + "multiqc_null_values_filter.txt:md5,88b2d9e16cd8ab52f58a48fd5d915b8c", + "multiqc_ratio_nulls.txt:md5,c9ac04a67937c7bacfebc33fcd50aab1", + "multiqc_ratio_zeros.txt:md5,9f50cd64ea4afe3723c7e222182981f6", + "multiqc_total_gene_id_occurrence_quantiles.txt:md5,497b807412eb4478e97ff0c50846c9ce", + "multiqc_zero_values_filter.txt:md5,4082d32f92221ed686e79910c6d2f6b3", + "accessions.txt:md5,76e5e3af7c72eac7a1993a2bd75b4d1a", + "selected_experiments.metadata.tsv:md5,cf220f0d0aab141abf220c856430f2f2", + "species_experiments.metadata.tsv:md5,7c354b570fd393d913bdf1fc53db1db8", + "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1", + "id_mapping_stats.csv:md5,dc2d9d7f34e570411c8cf5885b447719", + "missing_values_filter_stats.csv:md5,7db5e238928f520d761bd4792334304b", + "ratio_nulls.csv:md5,62625b0e4f7f36a59dfe077a4c709a94", + "ratio_nulls_per_sample.csv:md5,be115e6d6c5ed7b7206891ebaa0f7a67", + "ratio_zeros.csv:md5,96bbe4bd2d4c29ab5701588132af9684", + "zero_values_filter_stats.csv:md5,17fc6d525450d34445bf9cc25defe18a" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T11:58:29.889348049" + }, + "-profile test_download_only": { + "content": [ + { + "EXPRESSION_ATLAS": { + "ExpressionAtlas": "1.34.0", + "R": "4.4.3 (2025-02-28)", + "httpx": "0.28.1", + "nltk": "3.9.2", + "pandas": "3.0.1", + "python": 
"3.14.3", + "pyyaml": "6.0.3" + }, + "Workflow": { + "nf-core/stableexpression": "v1.0.0" + } + }, + [ + "errors", + "multiqc", + "multiqc/multiqc_data", + "multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_stableexpression_software_mqc_versions.yml", + "public_data", + "public_data/expression_atlas", + "public_data/expression_atlas/accessions", + "public_data/expression_atlas/accessions/accessions.txt", + "public_data/expression_atlas/accessions/selected_experiments.metadata.tsv", + "public_data/expression_atlas/accessions/species_experiments.metadata.tsv", + "public_data/expression_atlas/datasets", + "public_data/expression_atlas/datasets/E_MTAB_5309_rnaseq.design.csv", + "public_data/expression_atlas/datasets/E_MTAB_5309_rnaseq.rnaseq.raw.counts.csv", + "statistics", + "warnings" + ], + [ + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "accessions.txt:md5,f43aeb39de22137f546d2edf66d51228", + "selected_experiments.metadata.tsv:md5,d8254b794ee7a57ae6e31a72e8e2d822", + "species_experiments.metadata.tsv:md5,d8254b794ee7a57ae6e31a72e8e2d822", + "E_MTAB_5309_rnaseq.design.csv:md5,2556c1fd31b15eda7e4bbe042c83cc39", + "E_MTAB_5309_rnaseq.rnaseq.raw.counts.csv:md5,5c45a59ce7fbc59e5784f40dfb8c3b71" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T11:59:27.717850311" + } +} \ No newline at end of file diff --git a/tests/modules/local/aggregate_results/main.nf.test b/tests/modules/local/aggregate_results/main.nf.test new file mode 100644 index 00000000..27bb47d4 --- /dev/null +++ b/tests/modules/local/aggregate_results/main.nf.test @@ -0,0 +1,154 @@ +nextflow_process { + + 
name "Test Process AGGREGATE_RESULTS" + script "modules/local/aggregate_results/main.nf" + process "AGGREGATE_RESULTS" + tag "aggregate_results" + + test("Without microarray") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true) ] + input[3] = [] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With microarray") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ + file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = [] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( 
'$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("With valid target genes") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ + file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = ["ENSRNA049454747", "ENSRNA049434246"] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One invalid target gene") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = [ + file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv', checkIfExists: true) + ] + input[2] = [ + file( 
'$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/aggregate_results/microarray_stats_all_genes.csv', checkIfExists: true) + ] + input[3] = ["ENSRNA049454747", "UNKNOWNGENEID1234"] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One section") { + + when { + process { + """ + input[0] = file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file( '$projectDir/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv', checkIfExists: true) + input[2] = [ file( '$projectDir/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv', checkIfExists: true) ] + input[3] = [] + input[4] = file( '$projectDir/tests/test_data/aggregate_results/metadata.csv', checkIfExists: true) + input[5] = file( '$projectDir/tests/test_data/aggregate_results/mapping.csv', checkIfExists: true) + input[6] = file( '$projectDir/assets/multiqc_config.custom_content.template.yaml', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/aggregate_results/main.nf.test.snap b/tests/modules/local/aggregate_results/main.nf.test.snap new file mode 100644 index 00000000..bfa2a9aa --- /dev/null +++ b/tests/modules/local/aggregate_results/main.nf.test.snap @@ -0,0 +1,330 @@ +{ + "With valid target genes": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "1": [ + [ + 
"section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,edf372668919bebe05783bc16995c5c4" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,edf372668919bebe05783bc16995c5c4" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-04T09:38:02.365611798" + }, + "Without microarray": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,62a7b6ba136e4e2f7ab954386a6fbe5e" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78", + "section_2.most_stable_genes_summary.csv:md5,edc6b56e2f4710c490906cd8c9a54790" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + 
"custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,62a7b6ba136e4e2f7ab954386a6fbe5e" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78", + "section_2.most_stable_genes_summary.csv:md5,edc6b56e2f4710c490906cd8c9a54790" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:06:37.615799808" + }, + "One invalid target gene": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,401b9b35a47e29a8dfac3ca7700e26bd" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + 
"custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,401b9b35a47e29a8dfac3ca7700e26bd" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:47:07.501875225" + }, + "One section": { + "content": [ + { + "0": [ + "all_genes_summary.csv:md5,d2d279f4c5243b3af01130ca04b5603d" + ], + "1": [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78" + ], + "2": [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd" + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,4941e220852b2c814302f508cf5837cd" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,d2d279f4c5243b3af01130ca04b5603d" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,4941e220852b2c814302f508cf5837cd" + ], + "most_stable_genes_summary": [ + "section_1.most_stable_genes_summary.csv:md5,8f75c2b1041d3cea08f13dfa05378a78" + ], + "most_stable_genes_transposed_counts_filtered": [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:47:13.474058057" + }, + "With microarray": { + "content": [ + { + "0": [ + 
"all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "1": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "2": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ], + "3": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "4": [ + [ + "AGGREGATE_RESULTS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "AGGREGATE_RESULTS", + "polars", + "1.39.2" + ] + ], + "6": [ + [ + "AGGREGATE_RESULTS", + "pyyaml", + "6.0.3" + ] + ], + "all_genes_summary": [ + "all_genes_summary.csv:md5,67da835eca8de21309e7b3ec0f6a31f7" + ], + "custom_content_multiqc_config": [ + "custom_content_multiqc_config.yaml:md5,3b4d962847a26bdc7c0fa34c4fcff168" + ], + "most_stable_genes_summary": [ + [ + "section_1.most_stable_genes_summary.csv:md5,a4c693ec5ae3f4e0e5313811dd96fa21", + "section_2.most_stable_genes_summary.csv:md5,1c33432a2576231c821e52424113a65b" + ] + ], + "most_stable_genes_transposed_counts_filtered": [ + [ + "section_1.most_stable_genes_transposed_counts.csv:md5,30f84570c2104f7cfac4289d583b68cd", + "section_2.most_stable_genes_transposed_counts.csv:md5,3ffbb8370e2bdd0c1867610a51405260" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:46:55.231695582" + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_dataset_statistics/main.nf.test b/tests/modules/local/compute_dataset_statistics/main.nf.test new file mode 100644 index 00000000..d0e815b7 --- /dev/null +++ b/tests/modules/local/compute_dataset_statistics/main.nf.test @@ -0,0 +1,32 @@ +nextflow_process { + + name "Test Process COMPUTE_DATASET_STATISTICS" + script "modules/local/compute_dataset_statistics/main.nf" + process 
"COMPUTE_DATASET_STATISTICS" + tag "dataset_stats" + + /* + TODO: see why this test works locally, even with act, but fails in CI + test("Should not fail") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + */ + +} diff --git a/tests/modules/local/compute_dataset_statistics/main.nf.test.snap b/tests/modules/local/compute_dataset_statistics/main.nf.test.snap new file mode 100644 index 00000000..f0454e06 --- /dev/null +++ b/tests/modules/local/compute_dataset_statistics/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should not fail": { + "content": [ + { + "0": [ + [ + "test", + "skewness.txt:md5,0503443761b306e254ac1c0075ea267e" + ] + ], + "1": [ + [ + "COMPUTE_DATASET_STATISTICS", + "python", + "3.12.8" + ] + ], + "2": [ + [ + "COMPUTE_DATASET_STATISTICS", + "polars", + "1.37.1" + ] + ] + } + ], + "timestamp": "2026-04-02T14:11:44.847136183", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_gene_statistics/main.nf.test b/tests/modules/local/compute_gene_statistics/main.nf.test new file mode 100644 index 00000000..9b126a91 --- /dev/null +++ b/tests/modules/local/compute_gene_statistics/main.nf.test @@ -0,0 +1,83 @@ +nextflow_process { + + name "Test Process COMPUTE_GENE_STATISTICS" + script "modules/local/compute_gene_statistics/main.nf" + process "COMPUTE_GENE_STATISTICS" + tag "gene_stats" + + test("No platform") { + + when { + process { + """ + input[0] = [ + [ platform: 'all' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + 
input[1] = file( '$projectDir/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv', checkIfExists: true) + input[2] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("RNAseq platform") { + + when { + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv', checkIfExists: true) + input[2] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("No imputed values") { + + when { + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + file( '$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true), + [] + ] + input[1] = file( '$projectDir/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv', checkIfExists: true) + input[2] = 0.2 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_gene_statistics/main.nf.test.snap b/tests/modules/local/compute_gene_statistics/main.nf.test.snap new file mode 100644 index 00000000..0e7756f2 --- /dev/null +++ b/tests/modules/local/compute_gene_statistics/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "No imputed values": { + "content": [ + { + "0": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ], + "1": [ + [ + "COMPUTE_GENE_STATISTICS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_GENE_STATISTICS", + "polars", + "1.39.2" + ] + ], + "stats": [ + 
"rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:48:46.011713833" + }, + "No platform": { + "content": [ + { + "0": [ + "stats_all_genes.csv:md5,42e9e52c43527e80489294a2c2dbbec0" + ], + "1": [ + [ + "COMPUTE_GENE_STATISTICS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_GENE_STATISTICS", + "polars", + "1.39.2" + ] + ], + "stats": [ + "stats_all_genes.csv:md5,42e9e52c43527e80489294a2c2dbbec0" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:48:33.525954126" + }, + "RNAseq platform": { + "content": [ + { + "0": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ], + "1": [ + [ + "COMPUTE_GENE_STATISTICS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_GENE_STATISTICS", + "polars", + "1.39.2" + ] + ], + "stats": [ + "rnaseq.stats_all_genes.csv:md5,e2a15d08a3ada8daba6d5b834dbe1de7" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T14:48:39.77826003" + } +} \ No newline at end of file diff --git a/tests/modules/local/compute_stability_scores/main.nf.test b/tests/modules/local/compute_stability_scores/main.nf.test new file mode 100644 index 00000000..275fdf75 --- /dev/null +++ b/tests/modules/local/compute_stability_scores/main.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + + name "Test Process COMPUTE_STABILITY_SCORES" + script "modules/local/compute_stability_scores/main.nf" + process "COMPUTE_STABILITY_SCORES" + tag "stability_scores" + + test("With Genorm") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv', checkIfExists: true), + file( '$projectDir/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv', checkIfExists: true), + file( 
'$projectDir/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet', checkIfExists: true), + ] + input[1] = "0.8,0.1,0.1,0.1" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Without Genorm") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv', checkIfExists: true), + [], + file( '$projectDir/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet', checkIfExists: true), + ] + input[1] = "0.8,0.1,0.1,0.1" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/compute_stability_scores/main.nf.test.snap b/tests/modules/local/compute_stability_scores/main.nf.test.snap new file mode 100644 index 00000000..5b386ca1 --- /dev/null +++ b/tests/modules/local/compute_stability_scores/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "With Genorm": { + "content": [ + { + "0": [ + "section_1.stats_with_scores.csv:md5,7b1dd3c6e4a666561ca6ebe14aae7b74" + ], + "1": [ + [ + "COMPUTE_STABILITY_SCORES", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_STABILITY_SCORES", + "polars", + "1.39.2" + ] + ], + "stats_with_stability_scores": [ + "section_1.stats_with_scores.csv:md5,7b1dd3c6e4a666561ca6ebe14aae7b74" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:20:22.075756497" + }, + "Without Genorm": { + "content": [ + { + "0": [ + "section_1.stats_with_scores.csv:md5,bdf823d07ed6fed0313e5cf2ce1811a6" + ], + "1": [ + [ + "COMPUTE_STABILITY_SCORES", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_STABILITY_SCORES", + "polars", + "1.39.2" + ] + ], + "stats_with_stability_scores": [ + "section_1.stats_with_scores.csv:md5,bdf823d07ed6fed0313e5cf2ce1811a6" + ] + } + ], + "meta": { + 
"nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:20:28.206402711" + } +} \ No newline at end of file diff --git a/tests/modules/local/expressionatlas/getaccessions/main.nf.test b/tests/modules/local/expressionatlas/getaccessions/main.nf.test new file mode 100644 index 00000000..a5e4f860 --- /dev/null +++ b/tests/modules/local/expressionatlas/getaccessions/main.nf.test @@ -0,0 +1,119 @@ +nextflow_process { + + name "Test Process EXPRESSIONATLAS_GETACCESSIONS" + script "modules/local/expressionatlas/getaccessions/main.nf" + process "EXPRESSIONATLAS_GETACCESSIONS" + tag "eatlas_getaccessions" + + test("Beta vulgaris one keyword - no platform") { + + when { + + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = [] + input[3] = 100 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris no keyword - rnaseq platform") { + + when { + + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = "rnaseq" + input[3] = 100 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris - no experiments left after random sampling") { + + when { + + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = [] + input[3] = 1 + input[4] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test('Solanum tuberosum two keywords - microarray') { + + when { + + process { + """ + input[0] = "solanum_tuberosum" + input[1] = "potato,phloem" + input[2] = "microarray" + input[3] = 10000 + input[4] = 42 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test('Solanum tuberosum no keyword') { + + when { + + process { + """ + input[0] = "solanum_tuberosum" + input[1] = "" + input[2] = "microarray" + input[3] = 100 + input[4] = 42 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap b/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap new file mode 100644 index 00000000..f784b39f --- /dev/null +++ b/tests/modules/local/expressionatlas/getaccessions/main.nf.test.snap @@ -0,0 +1,130 @@ +{ + "Solanum tuberosum two keywords - microarray": { + "content": [ + { + "0": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "1": [ + "ok" + ], + "2": [ + + ], + "3": [ + "species_experiments.metadata.tsv:md5,68b329da9893e34099c7d8ad5cb9c940" + ], + "4": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "httpx", + "0.28.1" + ] + ], + "6": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "nltk", + "3.9.2" + ] + ], + "7": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "pyyaml", + "6.0.3" + ] + ], + "8": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "pandas", + "3.0.1" + ] + ], + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "sampling_quota": [ + "ok" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-02-19T10:19:07.035607232" + }, + "Solanum tuberosum no keyword": { + "content": [ + { + "0": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "1": [ + "ok" + ], + "2": [ + + ], + "3": [ + "species_experiments.metadata.tsv:md5,68b329da9893e34099c7d8ad5cb9c940" + ], + "4": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "httpx", + "0.28.1" + ] + ], + "6": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "nltk", + "3.9.2" + ] + ], + "7": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "pyyaml", + "6.0.3" + ] + ], + "8": [ + [ + "EXPRESSIONATLAS_GETACCESSIONS", + "pandas", + "3.0.1" + ] + ], + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "sampling_quota": [ + 
"ok" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-02-19T10:19:20.628916067" + } +} \ No newline at end of file diff --git a/tests/modules/local/expressionatlas/getdata/main.nf.test b/tests/modules/local/expressionatlas/getdata/main.nf.test new file mode 100644 index 00000000..5af0b2a7 --- /dev/null +++ b/tests/modules/local/expressionatlas/getdata/main.nf.test @@ -0,0 +1,202 @@ +nextflow_process { + + name "Test Process EXPRESSIONATLAS_GETDATA" + script "modules/local/expressionatlas/getdata/main.nf" + process "EXPRESSIONATLAS_GETDATA" + tag "eatlas_getdata" + + test("Transcriptome Analysis of the potato (rnaseq)") { + + tag "getdata_potato" + + when { + + process { + """ + input[0] = "E-MTAB-552" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Transcription profiling by array of Arabidopsis mutant for fis2 (microarray)") { + + tag "getdata_arabido" + + when { + + process { + """ + input[0] = "E-TABM-1007" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Arabidopsis Geo dataset") { + + tag "getdata_arabido_geo" + + when { + + process { + """ + input[0] = "E-GEOD-62537" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Invalid accession") { + + tag "getdata_invalid" + + when { + + process { + """ + input[0] = "fake-accession" + """ + } + } + + // must be successful without any output + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("Accession not available") { + + tag "getdata_unavailable" + + when { 
+ + process { + """ + input[0] = "E-GEOD-161565656" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-MTAB-5132") { + + tag "getdata_unavailable" + + when { + + process { + """ + input[0] = "E-MTAB-5132" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-PROT-138") { + + tag "getdata_unsupported" + + when { + + process { + """ + input[0] = "E-PROT-138" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + + test("E-MTAB-3578 :: serverside error 550") { + + tag "getdata_error_550" + + when { + + process { + """ + input[0] = "E-MTAB-3578" + """ + } + } + + // check for the absence of expected output (the error is ignored but no output is produced) + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert process.out.design.size() == 0 } + ) + } + + } + +} diff --git a/tests/modules/local/expressionatlas/getdata/main.nf.test.snap b/tests/modules/local/expressionatlas/getdata/main.nf.test.snap new file mode 100644 index 00000000..cc3b5be5 --- /dev/null +++ b/tests/modules/local/expressionatlas/getdata/main.nf.test.snap @@ -0,0 +1,131 @@ +{ + "Transcriptome Analysis of the potato (rnaseq)": { + "content": [ + { + 
"0": [ + "E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" + ], + "1": [ + "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c" + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + [ + "EXPRESSIONATLAS_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "5": [ + [ + "EXPRESSIONATLAS_GETDATA", + "ExpressionAtlas", + "1.34.0" + ] + ], + "counts": [ + "E_MTAB_552_rnaseq.rnaseq.raw.counts.csv:md5,830f50b60b17b62f9ca2f6a163a2879f" + ], + "design": [ + "E_MTAB_552_rnaseq.design.csv:md5,b81490696f638e90c1cf14236bb0c08c" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:17:31.898448037" + }, + "Arabidopsis Geo dataset": { + "content": [ + { + "0": [ + "E_GEOD_62537_A_AFFY_2.microarray.normalised.counts.csv:md5,673c55171d0ccfc1d036bf43c49ae320" + ], + "1": [ + "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + [ + "EXPRESSIONATLAS_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "5": [ + [ + "EXPRESSIONATLAS_GETDATA", + "ExpressionAtlas", + "1.34.0" + ] + ], + "counts": [ + "E_GEOD_62537_A_AFFY_2.microarray.normalised.counts.csv:md5,673c55171d0ccfc1d036bf43c49ae320" + ], + "design": [ + "E_GEOD_62537_A_AFFY_2.design.csv:md5,7d7dd72be4f5b326dd25a36db01ebf88" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-29T16:45:22.368557567" + }, + "Transcription profiling by array of Arabidopsis mutant for fis2 (microarray)": { + "content": [ + { + "0": [ + "E_TABM_1007_A_AFFY_2.microarray.normalised.counts.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" + ], + "1": [ + "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1" + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + [ + "EXPRESSIONATLAS_GETDATA", + "R", + "4.4.3 (2025-02-28)" + ] + ], + "5": [ + [ + "EXPRESSIONATLAS_GETDATA", + "ExpressionAtlas", + "1.34.0" + ] + ], + "counts": [ + 
"E_TABM_1007_A_AFFY_2.microarray.normalised.counts.csv:md5,a3afe33d7eaed3339da9109bf25bb3ed" + ], + "design": [ + "E_TABM_1007_A_AFFY_2.design.csv:md5,120f63cae2193b97d7451483bdbbaab1" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:17:45.546042421" + } +} \ No newline at end of file diff --git a/tests/modules/local/filter_and_rename_genes/main.nf.test b/tests/modules/local/filter_and_rename_genes/main.nf.test new file mode 100644 index 00000000..a48b2fba --- /dev/null +++ b/tests/modules/local/filter_and_rename_genes/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + + name "Test Process FILTER_AND_RENAME_GENES" + script "modules/local/filter_and_rename_genes/main.nf" + process "FILTER_AND_RENAME_GENES" + tag "filter_and_rename_genes" + + test("Map Ensembl IDs") { + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) + ] + ) + input[1] = file("$projectDir/tests/test_data/idmapping/mapped/mapped_gene_ids.csv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/mapped/valid_gene_ids.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("No valid gene") { + + when { + process { + """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/base/counts.ensembl_ids.csv", checkIfExists: true) + ] + ) + input[1] = file("$projectDir/tests/test_data/idmapping/mapped/mapped_gene_ids.csv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/mapped/no_valid_gene_id.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Custom mapping - TSV") { + + tag "custom_mapping_tsv" + + when { + process { 
+ """ + input[0] = channel.of( + [ + [ dataset: "test" ], + file("$projectDir/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv", checkIfExists: true) + ] + ) + input[1] = file( "$projectDir/tests/test_data/idmapping/tsv/mapping.tsv", checkIfExists: true) + input[2] = file("$projectDir/tests/test_data/idmapping/tsv/valid_gene_ids.txt", checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/filter_and_rename_genes/main.nf.test.snap b/tests/modules/local/filter_and_rename_genes/main.nf.test.snap new file mode 100644 index 00000000..874f066c --- /dev/null +++ b/tests/modules/local/filter_and_rename_genes/main.nf.test.snap @@ -0,0 +1,156 @@ +{ + "Custom mapping - TSV": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + "test", + "failure_reason.txt:md5,0eea8256c81d0362f3f10979ab2de23e" + ] + ], + "2": [ + + ], + "3": [ + [ + "test", + "0", + "0", + "3", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:04:45.840061804" + }, + "Map Ensembl IDs": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.ensembl_ids.renamed.parquet:md5,1fe83a8ee993d02c9df18f7412d20f0f" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "test", + "2", + "1", + "1", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.ensembl_ids.renamed.parquet:md5,1fe83a8ee993d02c9df18f7412d20f0f" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:04:26.849187591" + }, + "No valid gene": { + "content": 
[ + { + "0": [ + + ], + "1": [ + [ + "test", + "failure_reason.txt:md5,0eea8256c81d0362f3f10979ab2de23e" + ] + ], + "2": [ + + ], + "3": [ + [ + "test", + "0", + "0", + "3", + "0" + ] + ], + "4": [ + [ + "FILTER_AND_RENAME_GENES", + "python", + "3.14.3" + ] + ], + "5": [ + [ + "FILTER_AND_RENAME_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:04:36.329611443" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/compute_m_measure/main.nf.test b/tests/modules/local/genorm/compute_m_measure/main.nf.test new file mode 100644 index 00000000..fbcd556c --- /dev/null +++ b/tests/modules/local/genorm/compute_m_measure/main.nf.test @@ -0,0 +1,29 @@ +nextflow_process { + + name "Test Process COMPUTE_M_MEASURE" + script "modules/local/genorm/compute_m_measure/main.nf" + process "COMPUTE_M_MEASURE" + tag "m_measure" + + test("Four initial chunk files") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/genorm/compute_m_measure/input/std.*.parquet', checkIfExists: true).collect() + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap b/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap new file mode 100644 index 00000000..767f3dd7 --- /dev/null +++ b/tests/modules/local/genorm/compute_m_measure/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "Four initial chunk files": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ], + "1": [ + [ + "COMPUTE_M_MEASURE", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "COMPUTE_M_MEASURE", + "polars", + "1.39.2" + ] + ], + "m_measures": [ + [ + { 
+ "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:40:23.09370734" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/cross_join/main.nf.test b/tests/modules/local/genorm/cross_join/main.nf.test new file mode 100644 index 00000000..3e4d4f7a --- /dev/null +++ b/tests/modules/local/genorm/cross_join/main.nf.test @@ -0,0 +1,29 @@ +nextflow_process { + + name "Test Process CROSS_JOIN" + script "modules/local/genorm/cross_join/main.nf" + process "CROSS_JOIN" + tag "cross_join" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1", index_1: 0, index_2: 1], + file( '$projectDir/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet', checkIfExists: true), + file( '$projectDir/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/cross_join/main.nf.test.snap b/tests/modules/local/genorm/cross_join/main.nf.test.snap new file mode 100644 index 00000000..50d11bc4 --- /dev/null +++ b/tests/modules/local/genorm/cross_join/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "cross_join.0.1.parquet:md5,10d5591947a85f788dd6db61a1486f14" + ] + ], + "1": [ + [ + "CROSS_JOIN", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "CROSS_JOIN", + "polars", + "1.39.2" + ] + ], + "data": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "cross_join.0.1.parquet:md5,10d5591947a85f788dd6db61a1486f14" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:40:29.248178717" + } +} \ No newline at 
end of file diff --git a/tests/modules/local/genorm/expression_ratio/main.nf.test b/tests/modules/local/genorm/expression_ratio/main.nf.test new file mode 100644 index 00000000..9b355b77 --- /dev/null +++ b/tests/modules/local/genorm/expression_ratio/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process EXPRESSION_RATIO" + script "modules/local/genorm/expression_ratio/main.nf" + process "EXPRESSION_RATIO" + tag "expression_ratio" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1", index_1: 0, index_2: 1], + file( '$projectDir/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/expression_ratio/main.nf.test.snap b/tests/modules/local/genorm/expression_ratio/main.nf.test.snap new file mode 100644 index 00000000..a9e492ef --- /dev/null +++ b/tests/modules/local/genorm/expression_ratio/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "ratios.0.1.parquet:md5,dd929c967bc78a650c33eb0885544f50" + ] + ], + "1": [ + [ + "EXPRESSION_RATIO", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "EXPRESSION_RATIO", + "polars", + "1.39.2" + ] + ], + "data": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "ratios.0.1.parquet:md5,dd929c967bc78a650c33eb0885544f50" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-01T09:41:39.459415462" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/make_chunks/main.nf.test b/tests/modules/local/genorm/make_chunks/main.nf.test new file mode 100644 index 00000000..3a9c230f --- /dev/null +++ b/tests/modules/local/genorm/make_chunks/main.nf.test @@ -0,0 +1,28 @@ 
+nextflow_process { + + name "Test Process MAKE_CHUNKS" + script "modules/local/genorm/make_chunks/main.nf" + process "MAKE_CHUNKS" + tag "make_chunks" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/make_chunks/main.nf.test.snap b/tests/modules/local/genorm/make_chunks/main.nf.test.snap new file mode 100644 index 00000000..5b2f00be --- /dev/null +++ b/tests/modules/local/genorm/make_chunks/main.nf.test.snap @@ -0,0 +1,65 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + [ + "count_chunk.0.parquet:md5,2b49edb51f57065edec0dbbc3b50cd03", + "count_chunk.1.parquet:md5,a229839cc11b60b51d75e69bda1b079e", + "count_chunk.2.parquet:md5,79e06a8d5438a1fd8c35bb7e861bbb2f", + "count_chunk.3.parquet:md5,b4b75fd8c257684914ea81acec63c7b2", + "count_chunk.4.parquet:md5,938d6eb757a2114fba7c37cb79917fdb", + "count_chunk.5.parquet:md5,7de0a7158eaf28de2728ad10ed68fea3", + "count_chunk.6.parquet:md5,b7bb9a8ed8578bbf661d60dc0cc43a09", + "count_chunk.7.parquet:md5,d424e46fbcab660f7994086d95d83955", + "count_chunk.8.parquet:md5,5411ffdabeda55de3d67ae8cc32e0276", + "count_chunk.9.parquet:md5,484ecc44837b0a0f3098bff5a8144853" + ] + ] + ], + "1": [ + [ + "MAKE_CHUNKS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "MAKE_CHUNKS", + "polars", + "1.39.2" + ] + ], + "chunks": [ + [ + { + "section": "section_1" + }, + [ + "count_chunk.0.parquet:md5,2b49edb51f57065edec0dbbc3b50cd03", + "count_chunk.1.parquet:md5,a229839cc11b60b51d75e69bda1b079e", + "count_chunk.2.parquet:md5,79e06a8d5438a1fd8c35bb7e861bbb2f", + "count_chunk.3.parquet:md5,b4b75fd8c257684914ea81acec63c7b2", + 
"count_chunk.4.parquet:md5,938d6eb757a2114fba7c37cb79917fdb", + "count_chunk.5.parquet:md5,7de0a7158eaf28de2728ad10ed68fea3", + "count_chunk.6.parquet:md5,b7bb9a8ed8578bbf661d60dc0cc43a09", + "count_chunk.7.parquet:md5,d424e46fbcab660f7994086d95d83955", + "count_chunk.8.parquet:md5,5411ffdabeda55de3d67ae8cc32e0276", + "count_chunk.9.parquet:md5,484ecc44837b0a0f3098bff5a8144853" + ] + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:40:46.563584649" + } +} \ No newline at end of file diff --git a/tests/modules/local/genorm/ratio_standard_variation/main.nf.test b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test new file mode 100644 index 00000000..d235f36e --- /dev/null +++ b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test @@ -0,0 +1,28 @@ +nextflow_process { + + name "Test Process RATIO_STANDARD_VARIATION" + script "modules/local/genorm/ratio_standard_variation/main.nf" + process "RATIO_STANDARD_VARIATION" + tag "ratio_std" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [section: "section_1", index_1: 0, index_2: 1], + file( '$projectDir/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet', checkIfExists: true) + ] + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap new file mode 100644 index 00000000..6f29543d --- /dev/null +++ b/tests/modules/local/genorm/ratio_standard_variation/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "std.0.1.parquet:md5,10e262fc1dff8efe522a2efcee6ccb87" + ] + ], + "1": [ + [ + "RATIO_STANDARD_VARIATION", + "python", + "3.14.3" + ] + ], + "2": [ + [ + 
"RATIO_STANDARD_VARIATION", + "polars", + "1.39.2" + ] + ], + "data": [ + [ + { + "section": "section_1", + "index_1": 0, + "index_2": 1 + }, + "std.0.1.parquet:md5,10e262fc1dff8efe522a2efcee6ccb87" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-01T09:41:51.590963847" + } +} \ No newline at end of file diff --git a/tests/modules/local/geo/getaccessions/main.nf.test b/tests/modules/local/geo/getaccessions/main.nf.test new file mode 100644 index 00000000..deddb432 --- /dev/null +++ b/tests/modules/local/geo/getaccessions/main.nf.test @@ -0,0 +1,71 @@ +nextflow_process { + + name "Test Process GEO_GETACCESSIONS" + script "modules/local/geo/getaccessions/main.nf" + process "GEO_GETACCESSIONS" + tag "geo_getaccession" + + test("Beta vulgaris - exclude two accessions") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "" + input[2] = [] + input[3] = file( '$projectDir/tests/test_data/public_accessions/exclude_one_geo_accession.txt', checkIfExists: true ) + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris - leaf / microarray") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = "microarray" + input[3] = [] + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + + test("Beta vulgaris - leaf / microarray") { + + when { + process { + """ + input[0] = "beta_vulgaris" + input[1] = "leaf" + input[2] = "microarray" + input[3] = [] + input[4] = 100 + input[5] = 42 + """ + } + } + + then { + assert process.success + } + + } + +} diff --git a/tests/modules/local/geo/getaccessions/main.nf.test.snap b/tests/modules/local/geo/getaccessions/main.nf.test.snap new file mode 100644 index 00000000..17bfedde --- /dev/null +++ b/tests/modules/local/geo/getaccessions/main.nf.test.snap @@ -0,0 +1,87 @@ +{ + "Beta vulgaris": { + "content": [ + { + "0": [ + 
"accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "1": [ + [ + "filtered_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "rejected_datasets.metadata.tsv:md5,b7382bbefa84d5bb60089b057e75c09b", + "selected_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "species_datasets.metadata.tsv:md5,c36fd625541112de75d4d4ab38ec68e5" + ] + ], + "2": [ + "selected_datasets.keywords.yaml:md5,f7726c8e3b07ed20e5572d79fb7f575e" + ], + "3": [ + [ + "GEO_GETACCESSIONS", + "python", + "3.13.7" + ] + ], + "4": [ + [ + "GEO_GETACCESSIONS", + "requests", + "2.32.5" + ] + ], + "5": [ + [ + "GEO_GETACCESSIONS", + "nltk", + "3.9.1" + ] + ], + "6": [ + [ + "GEO_GETACCESSIONS", + "pyyaml", + "6.0.2" + ] + ], + "7": [ + [ + "GEO_GETACCESSIONS", + "pandas", + "2.3.2" + ] + ], + "8": [ + [ + "GEO_GETACCESSIONS", + "xmltodict", + "0.14.2" + ] + ], + "9": [ + [ + "GEO_GETACCESSIONS", + "biopython", + "1.85" + ] + ], + "accessions": [ + "accessions.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + "metadata": [ + [ + "filtered_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "rejected_datasets.metadata.tsv:md5,b7382bbefa84d5bb60089b057e75c09b", + "selected_datasets.metadata.tsv:md5,4d458c56f3fc71e92feda6d4031ff17e", + "species_datasets.metadata.tsv:md5,c36fd625541112de75d4d4ab38ec68e5" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.8" + }, + "timestamp": "2025-10-18T11:17:44.966423003" + } +} \ No newline at end of file diff --git a/tests/modules/local/geo/getdata/main.nf.test b/tests/modules/local/geo/getdata/main.nf.test new file mode 100644 index 00000000..d71dbf3b --- /dev/null +++ b/tests/modules/local/geo/getdata/main.nf.test @@ -0,0 +1,237 @@ +nextflow_process { + + name "Test Process GEO_GETDATA" + script "modules/local/geo/getdata/main.nf" + process "GEO_GETDATA" + tag "geo_getdata" + + /* + // TODO: see why these tests give ".command.run: No such file or directory" errors sometimes, even when running locally with 
act + // since this process is experimental, we can skip it for now + test("Beta vulgaris - Small RNA of sugar beet in response to drought stress") { + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE205328" + ] + input[1] = "beta vulgaris" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + test("Accession does not exist") { + + when { + + process { + """ + input[0] = [ + [ ], + "GSE568945478" + ] + input[1] = "blabla" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Only one sample among several") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE59707" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - No data found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE124142" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by array") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE43665" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by high throughput sequencing / Some raw counts found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE59707" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + 
assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Expression profiling by high throughput sequencing / One raw count found") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE100837" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Only series suppl data but multiple species") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE274048" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Drosophila simulans - Mismatch in suppl data colnames / design") { + + when { + + process { + """ + input[0] = [ + [ id: "test" ], + "GSE49127" + ] + input[1] = "drosophila_simulans" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.exitStatus == 0 }, + { assert process.out.counts.size() == 0 }, + { assert snapshot(process.out).match() } + ) + } + + } + */ + +} diff --git a/tests/modules/local/geo/getdata/main.nf.test.snap b/tests/modules/local/geo/getdata/main.nf.test.snap new file mode 100644 index 00000000..3eab3ef8 --- /dev/null +++ b/tests/modules/local/geo/getdata/main.nf.test.snap @@ -0,0 +1,506 @@ +{ + "Drosophila simulans - Expression profiling by high throughput sequencing / One raw count found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": 
[ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:35.640938644", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - No data found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:07.057142353", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Mismatch in suppl data colnames / design": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:54.838651132", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Accession does not exist": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:53:48.164404869", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Expression profiling by array": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + 
[ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:16.400915284", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Expression profiling by high throughput sequencing / Some raw counts found": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:25.947789471", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Only series suppl data but multiple species": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:54:45.272163295", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Drosophila simulans - Only one sample among several": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:53:57.533758257", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + }, + "Beta vulgaris - 
Small RNA of sugar beet in response to drought stress": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + [ + "GEO_GETDATA", + "R", + "4.5.3 (2026-03-11)" + ] + ], + "6": [ + [ + "GEO_GETDATA", + "GEOquery", + "2.78.0" + ] + ], + "7": [ + [ + "GEO_GETDATA", + "dplyr", + "1.2.0" + ] + ], + "counts": [ + + ], + "design": [ + + ], + "rejected": [ + + ] + } + ], + "timestamp": "2026-03-30T14:53:38.690525862", + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/modules/local/get_candidate_genes/main.nf.test b/tests/modules/local/get_candidate_genes/main.nf.test new file mode 100644 index 00000000..992a5635 --- /dev/null +++ b/tests/modules/local/get_candidate_genes/main.nf.test @@ -0,0 +1,54 @@ +nextflow_process { + + name "Test Process GET_CANDIDATE_GENES" + script "modules/local/get_candidate_genes/main.nf" + process "GET_CANDIDATE_GENES" + tag "get_candidate_genes" + + test("Nb sections & candidates per section lower than total nb genes") { + + when { + + process { + """ + input[0] = file('$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file('$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = "2" + input[3] = 3 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Too many sections") { + + when { + + process { + """ + input[0] = file('$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet', checkIfExists: true) + input[1] = file('$projectDir/tests/test_data/base_statistics/output/stats_all_genes.csv', checkIfExists: true) + input[2] = "50" + input[3] = 20 + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git 
a/tests/modules/local/get_candidate_genes/main.nf.test.snap b/tests/modules/local/get_candidate_genes/main.nf.test.snap new file mode 100644 index 00000000..7b869fd2 --- /dev/null +++ b/tests/modules/local/get_candidate_genes/main.nf.test.snap @@ -0,0 +1,132 @@ +{ + "Nb sections & candidates per section lower than total nb genes": { + "content": [ + { + "0": [ + [ + "section_1.candidate_counts.parquet:md5,7d1a1996214fb07741f2ad5c286fbc69", + "section_2.candidate_counts.parquet:md5,9e2de0ea75c3f839e38690f3d9a57b0b", + "section_3.candidate_counts.parquet:md5,860cdeb5dbfe7d24b2c12635ea85c10e" + ] + ], + "1": [ + [ + "section_1.stats.parquet:md5,3414fd57e9bf4f221b2df93be2e890a2", + "section_2.stats.parquet:md5,99b7bcc7944c77eb569b688c640d70f2", + "section_3.stats.parquet:md5,752dbfb5699fbe6847ac17a4fb6da51a" + ] + ], + "2": [ + [ + "GET_CANDIDATE_GENES", + "python", + "3.14.3" + ] + ], + "3": [ + [ + "GET_CANDIDATE_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + "section_1.candidate_counts.parquet:md5,7d1a1996214fb07741f2ad5c286fbc69", + "section_2.candidate_counts.parquet:md5,9e2de0ea75c3f839e38690f3d9a57b0b", + "section_3.candidate_counts.parquet:md5,860cdeb5dbfe7d24b2c12635ea85c10e" + ] + ], + "section_stats": [ + [ + "section_1.stats.parquet:md5,3414fd57e9bf4f221b2df93be2e890a2", + "section_2.stats.parquet:md5,99b7bcc7944c77eb569b688c640d70f2", + "section_3.stats.parquet:md5,752dbfb5699fbe6847ac17a4fb6da51a" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T17:07:03.292271274" + }, + "Too many sections": { + "content": [ + { + "0": [ + [ + "section_12.candidate_counts.parquet:md5,df8b5a6629b2b84b4c73156bbf261a92", + "section_14.candidate_counts.parquet:md5,1ef1a4e6cef6dd6b04ca28a505985bb5", + "section_16.candidate_counts.parquet:md5,f246e9291457873463ba3bff49e07b9d", + "section_18.candidate_counts.parquet:md5,f67b1be510bd1b41f58088ae08cc494c", + 
"section_20.candidate_counts.parquet:md5,3349d94fa3b42a917704c7abc2d807f9", + "section_3.candidate_counts.parquet:md5,2ddf5f2e7cca3e8df9930520b3131495", + "section_5.candidate_counts.parquet:md5,179d419cc48a36a991fbe74e4dcb28fa", + "section_7.candidate_counts.parquet:md5,41f08e512f5d44eff8fa0ce3d49ac0f4", + "section_9.candidate_counts.parquet:md5,85c942fa7ec2f2b4e5af6149de007328" + ] + ], + "1": [ + [ + "section_12.stats.parquet:md5,069d23175be9d1a733b4996895d4a3ce", + "section_14.stats.parquet:md5,73e852a1083f86d2d99d4f93ab6228c0", + "section_16.stats.parquet:md5,6aa616308faf4403194f18fae7cd1024", + "section_18.stats.parquet:md5,0b17005fab7582663111ea77cefca427", + "section_20.stats.parquet:md5,03c52c419ff94b63d0b16b2e9e87fa26", + "section_3.stats.parquet:md5,14a121ecab4116935fa9df136afc997a", + "section_5.stats.parquet:md5,3a3700641343056feabac2aa76626556", + "section_7.stats.parquet:md5,d6af8c940d55e449397b7fc0c428fedf", + "section_9.stats.parquet:md5,eb375761c7111b78bf8779bf71f876ef" + ] + ], + "2": [ + [ + "GET_CANDIDATE_GENES", + "python", + "3.14.3" + ] + ], + "3": [ + [ + "GET_CANDIDATE_GENES", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + "section_12.candidate_counts.parquet:md5,df8b5a6629b2b84b4c73156bbf261a92", + "section_14.candidate_counts.parquet:md5,1ef1a4e6cef6dd6b04ca28a505985bb5", + "section_16.candidate_counts.parquet:md5,f246e9291457873463ba3bff49e07b9d", + "section_18.candidate_counts.parquet:md5,f67b1be510bd1b41f58088ae08cc494c", + "section_20.candidate_counts.parquet:md5,3349d94fa3b42a917704c7abc2d807f9", + "section_3.candidate_counts.parquet:md5,2ddf5f2e7cca3e8df9930520b3131495", + "section_5.candidate_counts.parquet:md5,179d419cc48a36a991fbe74e4dcb28fa", + "section_7.candidate_counts.parquet:md5,41f08e512f5d44eff8fa0ce3d49ac0f4", + "section_9.candidate_counts.parquet:md5,85c942fa7ec2f2b4e5af6149de007328" + ] + ], + "section_stats": [ + [ + "section_12.stats.parquet:md5,069d23175be9d1a733b4996895d4a3ce", + 
"section_14.stats.parquet:md5,73e852a1083f86d2d99d4f93ab6228c0", + "section_16.stats.parquet:md5,6aa616308faf4403194f18fae7cd1024", + "section_18.stats.parquet:md5,0b17005fab7582663111ea77cefca427", + "section_20.stats.parquet:md5,03c52c419ff94b63d0b16b2e9e87fa26", + "section_3.stats.parquet:md5,14a121ecab4116935fa9df136afc997a", + "section_5.stats.parquet:md5,3a3700641343056feabac2aa76626556", + "section_7.stats.parquet:md5,d6af8c940d55e449397b7fc0c428fedf", + "section_9.stats.parquet:md5,eb375761c7111b78bf8779bf71f876ef" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T17:07:09.643611957" + } +} \ No newline at end of file diff --git a/tests/modules/local/gprofiler/idmapping/main.nf.test b/tests/modules/local/gprofiler/idmapping/main.nf.test new file mode 100644 index 00000000..e754ba0f --- /dev/null +++ b/tests/modules/local/gprofiler/idmapping/main.nf.test @@ -0,0 +1,51 @@ +nextflow_process { + + name "Test Process GPROFILER_IDMAPPING" + script "modules/local/gprofiler/idmapping/main.nf" + process "GPROFILER_IDMAPPING" + tag "gprofiler_idmapping" + + test("ENSG - Mapping found") { + + when { + + process { + """ + input[0] = file("$projectDir/tests/test_data/idmapping/gene_ids/gene_ids.txt", checkIfExists: true) + input[1] = "Solanum tuberosum" + input[2] = "ENSG" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.mapping.size() == 1 }, + { assert snapshot(process.out).match() } + ) + } + } + /* + test("Entrez - No mapping found") { + + when { + + process { + """ + input[0] = file("$projectDir/tests/test_data/idmapping/gene_ids/gene_ids.txt", checkIfExists: true) + input[1] = "Solanum tuberosum" + input[2] = "ENTREZGENE" + """ + } + } + + then { + assertAll( + { assert !process.success } + ) + } + } + */ + +} diff --git a/tests/modules/local/gprofiler/idmapping/main.nf.test.snap b/tests/modules/local/gprofiler/idmapping/main.nf.test.snap new file mode 100644 index 
00000000..850dae54 --- /dev/null +++ b/tests/modules/local/gprofiler/idmapping/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "ENSG - Mapping found": { + "content": [ + { + "0": [ + "mapped_gene_ids.csv:md5,c4ef4df6530509b486662a107ba8de44" + ], + "1": [ + "gene_metadata.csv:md5,f4dad0185e6f2d780f561d3efc301562" + ], + "2": [ + [ + "GPROFILER_IDMAPPING", + "python", + "3.14.3" + ] + ], + "3": [ + [ + "GPROFILER_IDMAPPING", + "pandas", + "3.0.1" + ] + ], + "4": [ + [ + "GPROFILER_IDMAPPING", + "httpx", + "0.28.1" + ] + ], + "metadata": [ + "gene_metadata.csv:md5,f4dad0185e6f2d780f561d3efc301562" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-02-19T10:26:01.249646558" + } +} \ No newline at end of file diff --git a/tests/modules/local/merge_counts/main.nf.test b/tests/modules/local/merge_counts/main.nf.test new file mode 100644 index 00000000..89651847 --- /dev/null +++ b/tests/modules/local/merge_counts/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + + name "Test Process MERGE_COUNTS" + script "modules/local/merge_counts/main.nf" + process "MERGE_COUNTS" + tag "merge_counts" + + test("3 files") { + + when { + + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + [ + file("$projectDir/tests/test_data/merge_data/input/counts1.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/merge_data/input/counts2.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/merge_data/input/counts3.parquet", checkIfExists: true), + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + /* + test("2 identical files") { + + when { + + process { + """ + input[0] = [ + [ platform: 'rnaseq' ], + [ + file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true), + file( "$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true) + 
] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + */ + test("1 file") { + + when { + + process { + """ + input[0] = [ + [ platform: 'microarray' ], + [ + file("$projectDir/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet", checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/tests/modules/local/merge_counts/main.nf.test.snap b/tests/modules/local/merge_counts/main.nf.test.snap new file mode 100644 index 00000000..6e773d44 --- /dev/null +++ b/tests/modules/local/merge_counts/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "1 file": { + "content": [ + { + "0": [ + [ + { + "platform": "microarray" + }, + "all_counts.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "platform": "microarray" + }, + "all_counts.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T16:41:25.646239587" + }, + "3 files": { + "content": [ + { + "0": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,c519c7936217c9399081069a48539c07" + ] + ], + "1": [ + [ + "MERGE_COUNTS", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "MERGE_COUNTS", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "platform": "rnaseq" + }, + "all_counts.parquet:md5,c519c7936217c9399081069a48539c07" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T16:39:46.447995126" + } +} \ No newline at end of file diff --git a/tests/modules/local/normalisation/compute_cpm/main.nf.test b/tests/modules/local/normalisation/compute_cpm/main.nf.test new file mode 100644 index 00000000..c32312b8 --- 
/dev/null +++ b/tests/modules/local/normalisation/compute_cpm/main.nf.test @@ -0,0 +1,101 @@ +nextflow_process { + + name "Test Process NORMALISATION_COMPUTE_CPM" + script "modules/local/normalisation/compute_cpm/main.nf" + process "NORMALISATION_COMPUTE_CPM" + tag "cpm_norm" + + + test("Very small dataset") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Rows with many zeros") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test"], + file('$projectDir/tests/test_data/normalisation/many_zeros/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One group") { + + when { + + process { + """ + input[0] = [ + [ dataset: "accession" ], + file('$projectDir/tests/test_data/normalisation/one_group/counts.csv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("TSV files") { + + when { + + process { + """ + input[0] = [ + [ dataset: "accession" ], + file('$projectDir/tests/test_data/normalisation/base/counts.tsv') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap b/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap new file mode 100644 index 00000000..6f3d7957 --- /dev/null +++ b/tests/modules/local/normalisation/compute_cpm/main.nf.test.snap @@ -0,0 +1,190 @@ +{ + "Very small dataset": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + 
"NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:07:24.246785277" + }, + "One group": { + "content": [ + { + "0": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,c8855975f68aad3c3bb060a23c14e2f9" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,c8855975f68aad3c3bb060a23c14e2f9" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:23:45.874853063" + }, + "TSV files": { + "content": [ + { + "0": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "accession" + }, + "counts.cpm.parquet:md5,8802fdfa77c0da39062bf357dccdd3cd" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:23:54.407797312" + }, + "Rows with many zeros": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,ab2596a5bb8b3b2e39754191a2dce2aa" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_CPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_CPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.cpm.parquet:md5,ab2596a5bb8b3b2e39754191a2dce2aa" + ] + ] + } + 
], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:07:34.001566313" + } +} \ No newline at end of file diff --git a/tests/modules/local/normalisation/compute_tpm/main.nf.test b/tests/modules/local/normalisation/compute_tpm/main.nf.test new file mode 100644 index 00000000..23463223 --- /dev/null +++ b/tests/modules/local/normalisation/compute_tpm/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process NORMALISATION_COMPUTE_TPM" + script "modules/local/normalisation/compute_tpm/main.nf" + process "NORMALISATION_COMPUTE_TPM" + tag "tpm_norm" + + + test("Very small dataset") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/base/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Rows with many zeros") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test"], + file('$projectDir/tests/test_data/normalisation/many_zeros/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/many_zeros/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("One group") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/one_group/counts.csv') + ] + input[1] = file('$projectDir/tests/test_data/normalisation/one_group/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("TSV files") { + + when { + + process { + """ + input[0] = [ + [ dataset: "test" ], + file('$projectDir/tests/test_data/normalisation/base/counts.tsv') + ] + input[1] = 
file('$projectDir/tests/test_data/normalisation/base/gene_lengths.csv') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + +} diff --git a/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap b/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap new file mode 100644 index 00000000..287c19f0 --- /dev/null +++ b/tests/modules/local/normalisation/compute_tpm/main.nf.test.snap @@ -0,0 +1,190 @@ +{ + "Very small dataset": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:08:02.882187442" + }, + "One group": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,2bb5797b24bcd02a06b2794c94567638" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,2bb5797b24bcd02a06b2794c94567638" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:08:21.950858405" + }, + "TSV files": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { 
+ "dataset": "test" + }, + "counts.tpm.parquet:md5,e8e08e6af6b76fe41793259203925e37" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:08:31.427239077" + }, + "Rows with many zeros": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,95563b1ba1083cfc31c2b9c18c5aeaaa" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + [ + "NORMALISATION_COMPUTE_TPM", + "python", + "3.14.3" + ] + ], + "4": [ + [ + "NORMALISATION_COMPUTE_TPM", + "polars", + "1.39.2" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "counts.tpm.parquet:md5,95563b1ba1083cfc31c2b9c18c5aeaaa" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:08:12.502224484" + } +} \ No newline at end of file diff --git a/tests/modules/local/normfinder/main.nf.test b/tests/modules/local/normfinder/main.nf.test new file mode 100644 index 00000000..105de0d3 --- /dev/null +++ b/tests/modules/local/normfinder/main.nf.test @@ -0,0 +1,50 @@ +nextflow_process { + + name "Test Process NORMFINDER" + script "modules/local/normfinder/main.nf" + process "NORMFINDER" + tag "normfinder" + + test("Very small dataset - Cq values") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/normfinder/very_small_cq/design.csv', checkIfExists: true) + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + + test("Small dataset - Real expression values") { + + when { + process { + """ + input[0] = [ + [section: "section_1"], + file( '$projectDir/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet', checkIfExists: true) + ] + input[1] = file( '$projectDir/tests/test_data/normfinder/small_normalised/design.csv', checkIfExists: true) + """ 
+ } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } + +} diff --git a/tests/modules/local/normfinder/main.nf.test.snap b/tests/modules/local/normfinder/main.nf.test.snap new file mode 100644 index 00000000..0b2298f7 --- /dev/null +++ b/tests/modules/local/normfinder/main.nf.test.snap @@ -0,0 +1,126 @@ +{ + "Small dataset - Real expression values": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,05b3b9508930923bd86c281e8febe6b6" + ] + ], + "1": [ + [ + "NORMFINDER", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "NORMFINDER", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "NORMFINDER", + "tqdm", + "4.67.3" + ] + ], + "4": [ + [ + "NORMFINDER", + "numpy", + "2.4.3" + ] + ], + "5": [ + [ + "NORMFINDER", + "numba", + "0.64.0" + ] + ], + "stability_values": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,05b3b9508930923bd86c281e8febe6b6" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:45:00.995645591" + }, + "Very small dataset - Cq values": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,a7c936faa9135439fd1b86c195f60414" + ] + ], + "1": [ + [ + "NORMFINDER", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "NORMFINDER", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "NORMFINDER", + "tqdm", + "4.67.3" + ] + ], + "4": [ + [ + "NORMFINDER", + "numpy", + "2.4.3" + ] + ], + "5": [ + [ + "NORMFINDER", + "numba", + "0.64.0" + ] + ], + "stability_values": [ + [ + { + "section": "section_1" + }, + "stability_values.normfinder.csv:md5,a7c936faa9135439fd1b86c195f60414" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-30T15:44:51.060894512" + } +} \ No newline at end of file diff --git a/tests/modules/local/quantile_normalisation/main.nf.test 
b/tests/modules/local/quantile_normalisation/main.nf.test new file mode 100644 index 00000000..469c52bb --- /dev/null +++ b/tests/modules/local/quantile_normalisation/main.nf.test @@ -0,0 +1,54 @@ +nextflow_process { + + name "Test Process QUANTILE_NORMALISATION" + script "modules/local/quantile_normalisation/main.nf" + process "QUANTILE_NORMALISATION" + tag "quant_norm" + + test("Uniform target distribution") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/quantile_normalisation/count.raw.cpm.csv', checkIfExists: true) + ] + input[1] = "uniform" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Normal target distribution") { + + when { + process { + """ + input[0] = [ + [dataset: 'test'], + file( '$projectDir/tests/test_data/quantile_normalisation/count.raw.cpm.csv', checkIfExists: true) + ] + input[1] = "normal" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/tests/modules/local/quantile_normalisation/main.nf.test.snap b/tests/modules/local/quantile_normalisation/main.nf.test.snap new file mode 100644 index 00000000..1e2bce1c --- /dev/null +++ b/tests/modules/local/quantile_normalisation/main.nf.test.snap @@ -0,0 +1,98 @@ +{ + "Uniform target distribution": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ], + "1": [ + [ + "QUANTILE_NORMALISATION", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "QUANTILE_NORMALISATION", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "QUANTILE_NORMALISATION", + "scikit-learn", + "1.8.0" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,4ceb116e0a52b92ab31ec4e122ed12a1" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": 
"2026-04-17T09:09:10.597987851" + }, + "Normal target distribution": { + "content": [ + { + "0": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,10c118fd62dad210b585f30620679732" + ] + ], + "1": [ + [ + "QUANTILE_NORMALISATION", + "python", + "3.14.3" + ] + ], + "2": [ + [ + "QUANTILE_NORMALISATION", + "polars", + "1.39.2" + ] + ], + "3": [ + [ + "QUANTILE_NORMALISATION", + "scikit-learn", + "1.8.0" + ] + ], + "counts": [ + [ + { + "dataset": "test" + }, + "count.raw.cpm.quant_norm.parquet:md5,10c118fd62dad210b585f30620679732" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-17T09:09:22.718260106" + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index dd96159d..70fa952f 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -12,3 +12,4 @@ params { } aws.client.anonymous = true // fixes S3 access issues on self-hosted runners +enable.moduleBinaries = true diff --git a/tests/subworkflows/local/download_public_datasets/main.nf.test b/tests/subworkflows/local/download_public_datasets/main.nf.test new file mode 100644 index 00000000..54391b25 --- /dev/null +++ b/tests/subworkflows/local/download_public_datasets/main.nf.test @@ -0,0 +1,52 @@ +nextflow_workflow { + + name "Test Workflow DOWNLOAD_PUBLIC_DATASETS" + script "subworkflows/local/download_public_datasets/main.nf" + workflow "DOWNLOAD_PUBLIC_DATASETS" + tag "download_public_datasets" + + test("Beta vulgaris - Eatlas + GEO - all accessions") { + + when { + params { + species = 'beta vulgaris' + } + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = channel.fromList(['E-MTAB-8187', 'GSE107627', 'GSE114968', 'GSE135555', 'GSE205413', 'GSE269454', 'GSE281272', 'GSE55951', 'GSE79526', 'GSE92859']) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + test("Beta vulgaris - Eatlas only") { + + when { + 
params { + species = 'beta vulgaris' + } + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = channel.fromList(['E-MTAB-8187']) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + + +} diff --git a/tests/subworkflows/local/download_public_datasets/main.nf.test.snap b/tests/subworkflows/local/download_public_datasets/main.nf.test.snap new file mode 100644 index 00000000..3e299483 --- /dev/null +++ b/tests/subworkflows/local/download_public_datasets/main.nf.test.snap @@ -0,0 +1,86 @@ +{ + "Beta vulgaris - Eatlas only": { + "content": [ + { + "0": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ], + "datasets": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-16T15:18:21.726044151" + }, + "Beta vulgaris - Eatlas + GEO - all accessions": { + "content": [ + { + "0": [ + [ + { + "dataset": "E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ], + [ + { + "dataset": "GSE55951_GPL18429", + "design": "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "normalised": true, + "platform": "microarray" + }, + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144" + ] + ], + "datasets": [ + [ + { + "dataset": 
"E_MTAB_8187_rnaseq", + "design": "E_MTAB_8187_rnaseq.design.csv:md5,fbd18d011d7d855452e5a30a303afcbf", + "normalised": false, + "platform": "rnaseq" + }, + "E_MTAB_8187_rnaseq.rnaseq.raw.counts.csv:md5,fe221fd94f66df7120b0590091e14eb1" + ], + [ + { + "dataset": "GSE55951_GPL18429", + "design": "GSE55951_GPL18429.microarray.normalised.design.csv:md5,f4872dff0edbe441d1600ffe2b67a25d", + "normalised": true, + "platform": "microarray" + }, + "GSE55951_GPL18429.microarray.normalised.counts.csv:md5,18fd2d728ad2ec5cb78f994f73375144" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-16T15:18:08.622422246" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/expression_normalisation/main.nf.test b/tests/subworkflows/local/expression_normalisation/main.nf.test new file mode 100644 index 00000000..34632aff --- /dev/null +++ b/tests/subworkflows/local/expression_normalisation/main.nf.test @@ -0,0 +1,172 @@ +nextflow_workflow { + + name "Test Workflow EXPRESSION_NORMALISATION" + script "subworkflows/local/expression_normalisation/main.nf" + workflow "EXPRESSION_NORMALISATION" + tag "subworkflow_expression_normalisation" + tag "subworkflow" + + test("TPM Normalisation") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: false, + design: file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ), + dataset: "rnaseq_raw", + platform: "rnaseq" + ], + file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + ], + [ + [ + normalised: true, + design: file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "tpm" + input[3] = "uniform" + input[4] = 
null + input[5] = null + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("TPM Normalisation with gene length") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: false, + design: file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ), + dataset: "rnaseq_raw", + platform: "rnaseq" + ], + file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + ], + [ + [ + normalised: true, + design: file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "tpm" + input[3] = "uniform" + input[4] = null + input[5] = file( '$projectDir/tests/test_data/input_datasets/gene_lengths.csv', checkIfExists: true ) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("CPM Normalisation") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: false, + design: file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.design.csv', checkIfExists: true ), + dataset: "rnaseq_raw", + platform: "rnaseq" + ], + file( '$projectDir/tests/test_data/input_datasets/rnaseq.raw.csv', checkIfExists: true ) + ], + [ + [ + normalised: true, + design: file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "cpm" + input[3] = "uniform" + input[4] = null + input[5] = null + """ + } + } 
+ + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("No rnaseq normalisation") { + + when { + workflow { + """ + input[0] = "solanum_tuberosum" + input[1] = channel.of( + [ + [ + normalised: true, + design: file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.design.csv', checkIfExists: true ), + dataset: "microarray_normalised", + platform: "microarray" + ], + file( '$projectDir/tests/test_data/input_datasets/microarray.normalised.csv', checkIfExists: true ) + ] + ) + input[2] = "tpm " + input[3] = "uniform" + input[4] = null + input[5] = null + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + +} diff --git a/tests/subworkflows/local/expression_normalisation/main.nf.test.snap b/tests/subworkflows/local/expression_normalisation/main.nf.test.snap new file mode 100644 index 00000000..04a5c4ec --- /dev/null +++ b/tests/subworkflows/local/expression_normalisation/main.nf.test.snap @@ -0,0 +1,188 @@ +{ + "CPM Normalisation": { + "content": [ + { + "0": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.cpm.quant_norm.parquet:md5,9f7988ca916b47ed614c824e001d2512" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.cpm.quant_norm.parquet:md5,9f7988ca916b47ed614c824e001d2512" + ], + [ + { + "normalised": true, + "design": 
"microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:27:13.766132141" + }, + "No rnaseq normalisation": { + "content": [ + { + "0": [ + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:27:25.897836784" + }, + "TPM Normalisation with gene length": { + "content": [ + { + "0": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,590b3bd6ec2b09533ef75ce9950d3a92" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,590b3bd6ec2b09533ef75ce9950d3a92" + ], + [ + { + "normalised": true, + "design": 
"microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:27:00.268510601" + }, + "TPM Normalisation": { + "content": [ + { + "0": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,d0e926a720de0803775b0dbd118b03ac" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ], + "counts": [ + [ + { + "normalised": false, + "design": "rnaseq.raw.design.csv:md5,39470b02a211aff791f9e4851b017488", + "dataset": "rnaseq_raw", + "platform": "rnaseq" + }, + "rnaseq.raw.tpm.quant_norm.parquet:md5,d0e926a720de0803775b0dbd118b03ac" + ], + [ + { + "normalised": true, + "design": "microarray.normalised.design.csv:md5,9662cc2f58d86cc552d6ff9cf094dd67", + "dataset": "microarray_normalised", + "platform": "microarray" + }, + "microarray.normalised.quant_norm.parquet:md5,0f9ed5a872e8c424a9ccc83b1c33753f" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-19T12:26:44.852023368" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/genorm/main.nf.test b/tests/subworkflows/local/genorm/main.nf.test new file mode 100644 index 00000000..c45f2553 --- /dev/null +++ b/tests/subworkflows/local/genorm/main.nf.test @@ -0,0 +1,53 @@ +nextflow_workflow { + + name "Test Workflow genorm" + script "subworkflows/local/genorm/main.nf" + workflow "GENORM" + tag "subworkflow_genorm" + tag 
"subworkflow" + + test("10 genes") { + + tag "subworkflow_genorm_10_genes" + + when { + workflow { + """ + input[0] = channel.of([ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.head.parquet', checkIfExists: true) + ]) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + + test("1000 genes") { + + tag "subworkflow_genorm_1000_genes" + + when { + workflow { + """ + input[0] = channel.of( [ + [section: "section_1"], + file( '$projectDir/tests/test_data/genorm/make_chunks/input/counts.parquet', checkIfExists: true) + ]) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + +} diff --git a/tests/subworkflows/local/genorm/main.nf.test.snap b/tests/subworkflows/local/genorm/main.nf.test.snap new file mode 100644 index 00000000..411b8f60 --- /dev/null +++ b/tests/subworkflows/local/genorm/main.nf.test.snap @@ -0,0 +1,56 @@ +{ + "1000 genes": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ], + "m_measures": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,2119b16fe13e2d0bc0fedc3c9d3d1733" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-01T09:56:47.48692894" + }, + "10 genes": { + "content": [ + { + "0": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,8bfea16844f247e2b871a8f559a3dd73" + ] + ], + "m_measures": [ + [ + { + "section": "section_1" + }, + "m_measures.csv:md5,8bfea16844f247e2b871a8f559a3dd73" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-01T09:55:53.207791305" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/genorm/run_genorm.py b/tests/subworkflows/local/genorm/run_genorm.py new file mode 100644 index 00000000..9704d7dc --- /dev/null +++ 
b/tests/subworkflows/local/genorm/run_genorm.py @@ -0,0 +1,44 @@ +import sys + +import numpy as np +import pandas as pd + +file = sys.argv[1] +# Expression data for three control genes. +counts = pd.read_parquet(file) +counts.set_index("gene_id", inplace=True) +counts = counts.T.replace(0, 1e-8) + + +def _m_numpy(gene_expression: np.ndarray) -> np.ndarray: + """Internal control gene-stability measure `M`. + + Computes Eq. (4) in Ref. [1]. + + [1]: Vandesompele, Jo, et al. "Accurate normalization of real-time quantitative + RT-PCR data by geometric averaging of multiple internal control genes." Genome + biology 3.7 (2002): 1-12. + """ + + if not (gene_expression > 0).all(): + raise ValueError( + "Expression domain error: not all expression data are strictly positive!" + ) + + a = gene_expression + # Eq. (2): A_{jk}^{(i)} = log_2 (a_{ij} / a_{ik}) + A = np.log2(np.einsum("ij,ik->ijk", a, 1 / a)) + # Eq. (3) + V = np.std(A, axis=0) + # Eq. (4) N.B., Since V_{j=k} is zero, we can simply ignore it since it does not + # contribute to calculation. 
+ n = V.shape[1] + return np.sum(V, axis=1) / (n - 1) + + +def m_measure(gene_expression): + m_values = _m_numpy(gene_expression.to_numpy()) + return pd.Series(m_values, index=gene_expression.columns) + + +print(m_measure(counts).sort_values()) diff --git a/tests/subworkflows/local/get_public_accessions/main.nf.test b/tests/subworkflows/local/get_public_accessions/main.nf.test new file mode 100644 index 00000000..dc449417 --- /dev/null +++ b/tests/subworkflows/local/get_public_accessions/main.nf.test @@ -0,0 +1,241 @@ +nextflow_workflow { + + name "Test Workflow GET_PUBLIC_ACCESSIONS" + script "subworkflows/local/get_public_accessions/main.nf" + workflow "GET_PUBLIC_ACCESSIONS" + tag "get_public_accessions" + + test("Fetch eatlas accessions without keywords") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + /* + //TODO: see why it gives issues in CI + test("Fetch public accessions with keywords + GEO") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = true + platform = null + keywords = "leaf" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + */ + + test("No GEO + accessions provided") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("Accessions file + Excluded accessions file") { + + when { + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "E-MTAB-552,E-GEOD-61690 ,E-PROT-138" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = file( '$projectDir/tests/test_data/public_accessions/exclude_one_two_accessions.txt', checkIfExists: true ) + random_sampling_size = null + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + test("With samplling size") { + + when { + + params { + species = 'beta vulgaris' + skip_fetch_eatlas_accessions = false + fetch_geo_accessions = false + platform = null + keywords = "" + accessions = "" + accessions_file = null + excluded_accessions = "" + excluded_accessions_file = null + random_sampling_size = 2 + random_sampling_seed = 42 + outdir = "$outputDir" + } + + workflow { + """ + input[0] = params.species.split(' ').join('_') + input[1] = params.skip_fetch_eatlas_accessions + input[2] = params.fetch_geo_accessions + input[3] = params.platform + input[4] = params.keywords + input[5] = channel.fromList( params.accessions.tokenize(',') ) + input[6] = params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty() + input[7] = channel.fromList( params.excluded_accessions.tokenize(',') ) + input[8] = params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty() + input[9] = params.random_sampling_size + input[10] = params.random_sampling_seed + input[11] = params.outdir + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + + } + + +} diff --git a/tests/subworkflows/local/get_public_accessions/main.nf.test.snap b/tests/subworkflows/local/get_public_accessions/main.nf.test.snap new file mode 100644 index 00000000..d19a70fb --- /dev/null +++ b/tests/subworkflows/local/get_public_accessions/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "Accessions file + Excluded accessions file": { + "content": [ + { + "0": [ + "E-ENAD-2", + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ], + "accessions": [ + "E-ENAD-2", + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T12:25:30.194867058" + }, + "No GEO + accessions provided": { + "content": [ + { + "0": [ + "E-ENAD-2", + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ], + "accessions": [ + "E-ENAD-2", + "E-GEOD-61690", + "E-MTAB-552", + "E-MTAB-8187", + "E-PROT-138" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T12:25:15.508377196" + }, + "Fetch eatlas accessions without keywords": { + "content": [ + { + "0": [ + "E-ENAD-2", + "E-MTAB-8187" + ], + "accessions": [ + "E-ENAD-2", + "E-MTAB-8187" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-16T12:25:00.873486754" + }, + "With samplling size": { + "content": [ + { + "0": [ + + ], + "accessions": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-01T14:53:02.05138484" + } +} \ No newline at end of file diff --git a/tests/test_data/aggregate_results/mapping.csv 
b/tests/test_data/aggregate_results/mapping.csv new file mode 100644 index 00000000..b3c00132 --- /dev/null +++ b/tests/test_data/aggregate_results/mapping.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,ENSRNA049454747 +ENSRNA049434246,ENSRNA049454887 +ENSRNA049434252,SNSRNA049434252 diff --git a/tests/test_data/aggregate_results/metadata.csv b/tests/test_data/aggregate_results/metadata.csv new file mode 100644 index 00000000..d4985a9f --- /dev/null +++ b/tests/test_data/aggregate_results/metadata.csv @@ -0,0 +1,4 @@ +gene_id,name,description +ENSRNA049454747,geneA,descriptionA +ENSRNA049454887,geneB,descriptionB +ENSRNA049454947,geneC,descriptionC diff --git a/tests/test_data/aggregate_results/microarray_stats_all_genes.csv b/tests/test_data/aggregate_results/microarray_stats_all_genes.csv new file mode 100644 index 00000000..0fe7d08f --- /dev/null +++ b/tests/test_data/aggregate_results/microarray_stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,microarray_mean,microarray_standard_deviation,microarray_median,microarray_median_absolute_deviation,microarray_coefficient_of_variation,microarray_robust_coefficient_of_variation_median,microarray_ratio_nulls_in_all_samples,microarray_ratio_nulls_in_valid_samples,microarray_ratio_zeros,microarray_expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 
+ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv b/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv new file mode 100644 index 00000000..de9af372 --- /dev/null +++ b/tests/test_data/aggregate_results/rnaseq_stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,rnaseq_mean,rnaseq_standard_deviation,rnaseq_median,rnaseq_median_absolute_deviation,rnaseq_coefficient_of_variation,rnaseq_robust_coefficient_of_variation_median,rnaseq_ratio_nulls_in_all_samples,rnaseq_ratio_nulls_in_valid_samples,rnaseq_ratio_zeros,rnaseq_expression_level_quantile_interval +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33 diff --git a/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv b/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv new file mode 100644 index 00000000..c210a841 --- 
/dev/null +++ b/tests/test_data/base_statistics/output/section_1.stats_with_scores.csv @@ -0,0 +1,6 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval,section,normfinder_stability_value,genorm_m_measure,is_candidate,normfinder_stability_value_normalised,genorm_m_measure_normalised,coefficient_of_variation_normalised,robust_coefficient_of_variation_median_normalised,stability_score,rank +ENSRNA049454747,0.570564,0.010209,0.568117,0.008682,0.017892,0.022657,0.000000,0.000000,0.000000,56.000000,9,0.004712,0.067012,1,0.000346,0.000000,0.000000,0.031965,0.032311,1 +ENSRNA049454887,0.552805,0.014706,0.552715,0.009310,0.026603,0.024974,0.000000,0.000000,0.000000,55.000000,9,0.006991,0.071275,1,0.197714,0.097993,0.224146,0.072973,0.592827,2 +ENSRNA049454931,0.556514,0.016277,0.555356,0.012927,0.029249,0.034509,0.000000,0.000000,0.000000,55.000000,9,0.005713,0.070772,1,0.087036,0.086431,0.292232,0.241735,0.707433,3 +ENSRNA049454947,0.565699,0.017542,0.563547,0.009311,0.031010,0.024495,0.000000,0.000000,0.000000,56.000000,9,0.006086,0.076305,1,0.119338,0.213617,0.337545,0.064496,0.734996,4 +ENSRNA049454955,0.577896,0.017702,0.576416,0.012490,0.030632,0.032127,0.000000,0.000000,0.000000,57.000000,9,0.006420,0.069699,1,0.148264,0.061766,0.327818,0.199575,0.737423,5 diff --git a/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv b/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv new file mode 100644 index 00000000..d0d6d6e8 --- /dev/null +++ b/tests/test_data/base_statistics/output/section_2.stats_with_scores.csv @@ -0,0 +1,5 @@ 
+gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval,section,normfinder_stability_value,genorm_m_measure,is_candidate,normfinder_stability_value_normalised,genorm_m_measure_normalised,coefficient_of_variation_normalised,robust_coefficient_of_variation_median_normalised,stability_score,rank +ENSRNA049454963,0.997524,0.000782,0.997419,0.000419,0.000784,0.000622,0.000000,0.000000,0.000000,99.000000,1,0.000125,0.002924,1,0.006574,0.001386,0.177943,0.089186,0.275089,6 +ENSRNA049454974,0.997944,0.000658,0.998069,0.000409,0.000659,0.000607,0.000000,0.000000,0.000000,99.000000,1,0.000185,0.003060,1,0.050402,0.039091,0.143564,0.086399,0.319456,7 +ENSRNA049455639,0.997911,0.000919,0.997909,0.000533,0.000921,0.000791,0.000000,0.000000,0.000000,99.000000,1,0.000116,0.002919,1,0.000000,0.000000,0.215622,0.120587,0.336209,8 +ENSRNA049455690,0.996857,0.000889,0.996528,0.000433,0.000892,0.000645,0.000000,0.000000,0.000000,99.000000,1,0.000155,0.002944,1,0.028488,0.006931,0.207646,0.093460,0.336524,9 diff --git a/tests/test_data/base_statistics/output/stats_all_genes.csv b/tests/test_data/base_statistics/output/stats_all_genes.csv new file mode 100644 index 00000000..9c2d792e --- /dev/null +++ b/tests/test_data/base_statistics/output/stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval +ENSRNA049454747,0.204895,0.197240,0.332892,0.111337,0.962641,0.495860,0.000000,0.000000,0.466667,28 +ENSRNA049454887,0.525767,0.039664,0.515980,0.014747,0.075440,0.042374,0.000000,0.000000,0.000000,52 +ENSRNA049454931,0.429906,0.040942,0.439106,0.028691,0.095235,0.096872,0.000000,0.000000,0.000000,43 
+ENSRNA049454947,0.337136,0.023450,0.332792,0.010226,0.069556,0.045556,0.000000,0.000000,0.000000,35 +ENSRNA049454955,0.356393,0.077994,0.367554,0.033003,0.218844,0.133124,0.000000,0.000000,0.033333,37 +ENSRNA049454963,0.473395,0.040190,0.468429,0.021211,0.084898,0.067134,0.000000,0.000000,0.000000,47 +ENSRNA049454974,0.652818,0.120681,0.623259,0.073014,0.184861,0.173684,0.000000,0.000000,0.000000,65 +ENSRNA049455639,0.566799,0.038299,0.562763,0.025460,0.067571,0.067073,0.000000,0.000000,0.000000,56 +ENSRNA049455690,0.653952,0.036833,0.647126,0.016865,0.056324,0.038639,0.000000,0.000000,0.000000,65 diff --git a/tests/test_data/compute_gene_statistics/input/design.csv b/tests/test_data/compute_gene_statistics/input/design.csv new file mode 100644 index 00000000..d3e8694c --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/design.csv @@ -0,0 +1,28 @@ +sample,condition,batch +ARR029909,g1,A +ARR029910,g1,A +ARR029911,g1,A +ARR029912,g2,A +ARR029913,g2,A +ARR029914,g2,A +ARR029915,g3,A +ARR029916,g3,A +ARR029917,g3,A +URR029909,g1,B +URR029910,g1,B +URR029911,g1,B +URR029912,g2,B +URR029913,g2,B +URR029914,g2,B +URR029915,g3,B +URR029916,g3,B +URR029917,g3,B +ERR029909,g1,C +ERR029910,g1,C +ERR029911,g1,C +ERR029912,g2,C +ERR029913,g2,C +ERR029914,g2,C +ERR029915,g3,C +ERR029916,g3,C +ERR029917,g3,C diff --git a/tests/test_data/compute_gene_statistics/input/gene_counts.csv b/tests/test_data/compute_gene_statistics/input/gene_counts.csv new file mode 100644 index 00000000..fad53618 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/gene_counts.csv @@ -0,0 +1,28 @@ +sample,count +ARR029909,4 +ARR029910,4 +ARR029911,4 +ARR029912,4 +ARR029913,4 +ARR029914,4 +ARR029915,4 +ARR029916,4 +ARR029917,4 +URR029909,2 +URR029910,2 +URR029911,2 +URR029912,2 +URR029913,2 +URR029914,2 +URR029915,2 +URR029916,2 +URR029917,2 +ERR029909,3 +ERR029910,3 +ERR029911,3 +ERR029912,3 +ERR029913,3 +ERR029914,3 +ERR029915,3 +ERR029916,3 +ERR029917,3 diff --git 
a/tests/test_data/compute_gene_statistics/input/ks_stats.csv b/tests/test_data/compute_gene_statistics/input/ks_stats.csv new file mode 100644 index 00000000..119c4ae5 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/ks_stats.csv @@ -0,0 +1,27 @@ +URR029909,0.99 +URR029910,0.58 +URR029911,0.24 +URR029912,0.12 +URR029913,0.05 +URR029914,0.0 +URR029915,0.897 +URR029916,0.999 +URR029917,0.23 +ERR029909,0.45 +ERR029910,0.87 +ERR029911,0.456 +ERR029912,0.457 +ERR029913,0.78 +ERR029914,0.32 +ERR029915,0.56 +ERR029916,0.45 +ERR029917,0.12 +ARR029909,0.21 +ARR029910,0.0000005 +ARR029911,0 +ARR029912,0.789 +ARR029913,0.987 +ARR029914,0.876 +ARR029915,0.123 +ARR029916,0.321 +ARR029917,0.156 diff --git a/tests/test_data/compute_gene_statistics/input/mapping1.csv b/tests/test_data/compute_gene_statistics/input/mapping1.csv new file mode 100644 index 00000000..8c5865b4 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/mapping1.csv @@ -0,0 +1,9 @@ +original_gene_id,gene_id +Q8VWG3,AT1G34790 +Q9FJA2,AT5G35550 +Q8RYD9,AT5G23260 +ABCD12,AT5G23261 +840386,AT1G34790 +833520,AT5G35550 +832390,AT5G23260 +123456,AT5G35550 diff --git a/tests/test_data/compute_gene_statistics/input/mapping2.csv b/tests/test_data/compute_gene_statistics/input/mapping2.csv new file mode 100644 index 00000000..080dbefd --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/mapping2.csv @@ -0,0 +1,9 @@ +original_gene_id,gene_id +Q8VWG3,AT1G34790 +Q9FJA2,AT5G35550 +Q8RYD9,AT5G23260 +ABCD12,AT5G23261 +840386,AT1G34790 +833520,AT5G35550 +832390,AT5G23260 +457862,AT5G23260 diff --git a/tests/test_data/compute_gene_statistics/input/mapping3.csv b/tests/test_data/compute_gene_statistics/input/mapping3.csv new file mode 100644 index 00000000..c8fbe3f9 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/mapping3.csv @@ -0,0 +1,5 @@ +original_gene_id,gene_id +Q8VWG3,AT1G34790 +Q9FJA2,AT5G35550 +Q8RYD9,AT5G23260 +152348,AT1G23260 diff --git 
a/tests/test_data/compute_gene_statistics/input/metadata1.csv b/tests/test_data/compute_gene_statistics/input/metadata1.csv new file mode 100644 index 00000000..399628bf --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/metadata1.csv @@ -0,0 +1,5 @@ +gene_id,name,description +AT1G34790,TT1,C2H2 and C2HC zinc fingers superfamily protein +AT5G35550,TT2,Duplicated homeodomain-like superfamily protein +AT5G23260,TT16,K-box region and MADS-box transcription factor family protein +AT5G23261,TT23,blabla diff --git a/tests/test_data/compute_gene_statistics/input/metadata2.csv b/tests/test_data/compute_gene_statistics/input/metadata2.csv new file mode 100644 index 00000000..69fadca4 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/metadata2.csv @@ -0,0 +1,4 @@ +gene_id,name,description +AT1G34790,TT1,C2H2 and C2HC zinc fingers superfamily protein +AT5G35550,TT2,Duplicated homeodomain-like superfamily protein +AT5G23260,TT16,K-box region and MADS-box transcription factor family protein diff --git a/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv b/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv new file mode 100644 index 00000000..e40090d6 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/microarray_stats_all_genes.csv @@ -0,0 +1,8 @@ +gene_id,microarray_mean,microarray_standard_deviation,microarray_median,microarray_median_absolute_deviation,microarray_variation_coefficient,microarray_total_nb_nulls,microarray_nb_nulls_valid_samples,microarray_stability_score,microarray_expression_level_quantile_interval +AT1G34790,0.6041722385984585,0.2965945020346847,0.8210634736950527,0.07852066041592076,0.49091051042450434,678,643,1.4392880915454482,71 +AT5G35550,0.04211885958141837,0.017403154131542625,0.04081717758449555,0.00889668425147598,0.41319148487155133,678,643,1.3615690659924953,0 
+AT5G23260,0.3265572056851324,0.12636844695328353,0.2977133397782717,0.09861099799987358,0.3869718528738528,678,643,1.3353494339947967,35 +AT5G23261,0.05948100952172446,0.0268768665570047,0.049569984365840696,0.021228253513649518,0.4518562608993441,678,643,1.400233842020288,1 +AT1G34790,0.5791984846868644,0.16532007773816776,0.5865184277282238,0.13319224137108376,0.28542905775650596,70,35,0.337051476635562,68 +AT5G35550,0.4069181057633956,0.2662419700433056,0.26506770843115524,0.13156965253473574,0.6542888268484007,678,643,1.6026664079693447,46 +AT5G23260,0.12079194562039748,0.060559689529495545,0.10818687095210754,0.0368400391021249,0.5013553612242599,678,643,1.449732942345204,7 diff --git a/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv b/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv new file mode 100644 index 00000000..fd6b8853 --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/ratio_nulls_per_sample.csv @@ -0,0 +1,9 @@ +sample,ratio +sample_63,0.0 +sample_64,0.0 +sample_65,0.0 +sample_66,0.0 +sample_67,0.0 +sample_68,0.0 +sample_69,0.0 +sample_70,0.0 diff --git a/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv b/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv new file mode 100644 index 00000000..e4c7327d --- /dev/null +++ b/tests/test_data/compute_gene_statistics/input/rnaseq_stats_all_genes.csv @@ -0,0 +1,8 @@ +gene_id,rnaseq_mean,rnaseq_standard_deviation,rnaseq_median,rnaseq_median_absolute_deviation,rnaseq_variation_coefficient,rnaseq_total_nb_nulls,rnaseq_nb_nulls_valid_samples,rnaseq_stability_score,rnaseq_expression_level_quantile_interval +AT1G34790,0.029004004004004002,0.061217504567865136,0.0,0.0,2.110657016852365,345,336,3.0544772415714663,0 +AT5G35550,0.2921254587921254,0.028005675342417956,0.28128128128128127,0.025025025025025016,0.09586865676896245,356,347,1.070587757892558,41 
+AT5G23260,0.051621388830691145,0.04715133948046024,0.04154154154154154,0.027027027027027035,0.9134070304677029,322,313,1.7926205136137703,3 +AT5G23261,0.06000444889333778,0.0796183056079376,0.030030030030030026,0.030030030030030026,1.3268733748303374,356,347,2.301592475953933,5 +AT1G34790,0.027638749860972082,0.019581626793675158,0.025525525525525526,0.014014014014014014,0.7084845332069752,356,347,1.6832036343305707,0 +AT5G35550,0.07687920478618152,0.05023977809403856,0.06906906906906907,0.03603603603603604,0.6534898251583997,322,313,1.532703308304467,8 +AT5G23260,0.05421550582840906,0.0785887308235655,0.0,0.0,1.4495618849761762,303,294,2.2754045816053896,4 diff --git a/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv b/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv new file mode 100644 index 00000000..b4b6ae10 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/genorm.m_measures.csv @@ -0,0 +1,10 @@ +gene_id,genorm_m_measure +ENSRNA049454747,0.16034699963469335 +ENSRNA049454887,0.525024672172669794 +ENSRNA049454931,0.264017707597323344 +ENSRNA049454947,0.037074358179388235 +ENSRNA049454955,0.65294154739420848 +ENSRNA049454963,0.213698246698642331 +ENSRNA049454974,0.16807095772646336 +ENSRNA049455639,0.02698654413301954 +ENSRNA049455690,0.57785261216485885 diff --git a/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv b/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv new file mode 100644 index 00000000..238572e2 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/stability_values.normfinder.csv @@ -0,0 +1,10 @@ +gene_id,normfinder_stability_value +ENSRNA049454747,0.036034699963469335 +ENSRNA049454887,0.05024672172669794 +ENSRNA049454931,0.014017707597323344 +ENSRNA049454947,0.037074358179388235 +ENSRNA049454955,0.03294154739420848 +ENSRNA049454963,0.03698246698642331 +ENSRNA049454974,0.06807095772646336 
+ENSRNA049455639,0.02698654413301954 +ENSRNA049455690,0.07785261216485885 diff --git a/tests/test_data/compute_stability_scores/input/stats_all_genes.csv b/tests/test_data/compute_stability_scores/input/stats_all_genes.csv new file mode 100644 index 00000000..c5ef7f74 --- /dev/null +++ b/tests/test_data/compute_stability_scores/input/stats_all_genes.csv @@ -0,0 +1,10 @@ +gene_id,mean,standard_deviation,median,median_absolute_deviation,coefficient_of_variation,robust_coefficient_of_variation_median,ratio_nulls_in_all_samples,ratio_nulls_in_valid_samples,ratio_zeros,expression_level_quantile_interval,section +ENSRNA049454747,0.9375,0.11572751247156893,1.0,0.0,0.12344267996967352,0.0,0.0,0.0,0.0,99,19 +ENSRNA049454887,0.140625,0.15580293184477811,0.125,0.125,1.1079319597850887,1.4826,0.0,0.0,0.5,11,19 +ENSRNA049454931,0.4453125,0.12246309575308217,0.4375,0.0625,0.2750048466034126,0.2118,0.0,0.0,0.0,66,19 +ENSRNA049454947,0.3984375,0.1887975933374152,0.375,0.125,0.4738449401409636,0.4942,0.0,0.0,0.0,44,19 +ENSRNA049454955,0.421875,0.18525441001112883,0.4375,0.15625,0.4391215644708239,0.5295,0.0,0.0,0.0,55,19 +ENSRNA049454963,0.78125,0.08838834764831845,0.75,0.0625,0.1131370849898476,0.12355,0.0,0.0,0.0,77,19 +ENSRNA049454974,0.859375,0.12387890112063936,0.875,0.0625,0.14414999403128945,0.1059,0.0,0.0,0.0,88,19 +ENSRNA049455639,0.15625,0.20863074009907004,0.125,0.125,1.3352367366340483,1.4826,0.0,0.0,0.375,22,19 +ENSRNA049455690,0.328125,0.34028283928856257,0.3125,0.3125,1.0370524625937145,1.4826,0.0,0.0,0.375,33,19 diff --git a/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet b/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet new file mode 100644 index 00000000..04b808d5 Binary files /dev/null and b/tests/test_data/compute_stability_scores/input/stats_all_genes.parquet differ diff --git a/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet 
b/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet new file mode 100644 index 00000000..c3f8fa07 Binary files /dev/null and b/tests/test_data/dataset_statistics/input/count.raw.cpm.quant_norm.parquet differ diff --git a/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet b/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet new file mode 100644 index 00000000..d9082e66 Binary files /dev/null and b/tests/test_data/dataset_statistics/input/count2.raw.cpm.quant_norm.parquet differ diff --git a/tests/test_data/dataset_statistics/output/test.dataset_stats.csv b/tests/test_data/dataset_statistics/output/test.dataset_stats.csv new file mode 100644 index 00000000..ead888ce --- /dev/null +++ b/tests/test_data/dataset_statistics/output/test.dataset_stats.csv @@ -0,0 +1,9 @@ +sample,count,mean,std,min,25%,50%,75%,max,skewness,kolmogorov_smirnov_pvalue +sample_63,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_64,9.0,0.5,0.34089725358236606,0.0,0.25,0.5625,0.75,1.0,-0.0059425832940604335,0.013238665147108418 +sample_65,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_66,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_67,9.0,0.4861111111111111,0.361444824435364,0.0,0.25,0.5,0.75,1.0,-0.09766083489340871,0.013238665147108418 +sample_68,9.0,0.4861111111111111,0.361444824435364,0.0,0.25,0.5,0.75,1.0,-0.09766083489340871,0.013238665147108418 +sample_69,9.0,0.5,0.3423265984407288,0.0,0.25,0.5,0.75,1.0,0.0,0.013238665147108418 +sample_70,9.0,0.5,0.34089725358236606,0.0,0.3125,0.5,0.75,1.0,0.0178277498821813,0.013238665147108418 diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet new file mode 100644 index 00000000..23b14fa8 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.0.parquet differ diff 
--git a/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet new file mode 100644 index 00000000..2df52c3b Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.1.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet new file mode 100644 index 00000000..48e587cf Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet new file mode 100644 index 00000000..2984fee1 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.0.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet new file mode 100644 index 00000000..fae48626 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.1.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet new file mode 100644 index 00000000..5dcaaf98 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.2.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet new file mode 100644 index 00000000..b297f2a0 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.1.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet b/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet new file mode 100644 index 00000000..3b7cda0f Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.2.2.parquet differ 
diff --git a/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet new file mode 100644 index 00000000..168d2c51 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.2.3.parquet differ diff --git a/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet b/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet new file mode 100644 index 00000000..e1fff9b2 Binary files /dev/null and b/tests/test_data/genorm/compute_m_measure/input/std.3.3.parquet differ diff --git a/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet b/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet new file mode 100644 index 00000000..d92b69ef Binary files /dev/null and b/tests/test_data/genorm/cross_join/output/cross_join.0.1.parquet differ diff --git a/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet b/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet new file mode 100644 index 00000000..b55c4965 Binary files /dev/null and b/tests/test_data/genorm/expression_ratio/output/ratios.0.1.parquet differ diff --git a/tests/test_data/genorm/make_chunks/input/counts.head.parquet b/tests/test_data/genorm/make_chunks/input/counts.head.parquet new file mode 100644 index 00000000..b63b13ef Binary files /dev/null and b/tests/test_data/genorm/make_chunks/input/counts.head.parquet differ diff --git a/tests/test_data/genorm/make_chunks/input/counts.parquet b/tests/test_data/genorm/make_chunks/input/counts.parquet new file mode 100644 index 00000000..c9764863 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/input/counts.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet new file mode 100644 index 00000000..2367ea1b Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.0.parquet differ diff 
--git a/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet new file mode 100644 index 00000000..98442e57 Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.1.parquet differ diff --git a/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet b/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet new file mode 100644 index 00000000..5e207c7a Binary files /dev/null and b/tests/test_data/genorm/make_chunks/output/count_chunk.2.parquet differ diff --git a/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet b/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet new file mode 100644 index 00000000..995652e8 Binary files /dev/null and b/tests/test_data/genorm/ratio_standard_variation/output/std.0.1.parquet differ diff --git a/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet b/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet new file mode 100644 index 00000000..976ff07d Binary files /dev/null and b/tests/test_data/genorm/ratio_standard_variation/output/std.0.2.parquet differ diff --git a/tests/test_data/idmapping/base/counts.ensembl_ids.csv b/tests/test_data/idmapping/base/counts.ensembl_ids.csv new file mode 100644 index 00000000..a093ec4b --- /dev/null +++ b/tests/test_data/idmapping/base/counts.ensembl_ids.csv @@ -0,0 +1,4 @@ +gend_id,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 +ENSRNA049434199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +ENSRNA049434246,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +ENSRNA049434252,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/tests/test_data/idmapping/base/counts.ncbi_ids.csv b/tests/test_data/idmapping/base/counts.ncbi_ids.csv new file mode 100644 index 00000000..b52dfe8d --- /dev/null +++ b/tests/test_data/idmapping/base/counts.ncbi_ids.csv @@ -0,0 +1,4 
@@ +ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 +840386,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +833520,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +832390,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/tests/test_data/idmapping/base/counts.uniprot_ids.csv b/tests/test_data/idmapping/base/counts.uniprot_ids.csv new file mode 100644 index 00000000..9a30df90 --- /dev/null +++ b/tests/test_data/idmapping/base/counts.uniprot_ids.csv @@ -0,0 +1,4 @@ +ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ERR029918,ERR029920,ERR029921,ERR029922,ERR029923,ERR029924 +Q8VWG3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +Q9FJA2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +Q8RYD9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/tests/test_data/idmapping/custom/mapping.csv b/tests/test_data/idmapping/custom/mapping.csv new file mode 100644 index 00000000..cd43e30f --- /dev/null +++ b/tests/test_data/idmapping/custom/mapping.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,SNSRNA049434199 +ENSRNA049434246,SNSRNA049434246 +ENSRNA049434252,SNSRNA049434252 diff --git a/tests/test_data/idmapping/custom/metadata.csv b/tests/test_data/idmapping/custom/metadata.csv new file mode 100644 index 00000000..0c4095a9 --- /dev/null +++ b/tests/test_data/idmapping/custom/metadata.csv @@ -0,0 +1,4 @@ +gene_id,name,description +SNSRNA049434199,geneA,descriptionA +SNSRNA049434246,geneB,descriptionB +SNSRNA049434252,geneC,descriptionC diff --git a/tests/test_data/idmapping/empty/counts.csv b/tests/test_data/idmapping/empty/counts.csv new file mode 100644 index 00000000..b8d84b76 --- /dev/null +++ b/tests/test_data/idmapping/empty/counts.csv @@ -0,0 +1 @@ +sample_1,sample_2,sample_3 diff --git a/tests/test_data/idmapping/gene_ids/gene_ids.txt b/tests/test_data/idmapping/gene_ids/gene_ids.txt new file mode 100644 index 00000000..94233419 --- /dev/null +++ b/tests/test_data/idmapping/gene_ids/gene_ids.txt @@ 
-0,0 +1,9 @@ +ENSRNA049434199 +ENSRNA049434246 +ENSRNA049434252 +840386 +833520 +832390 +Q8VWG3 +Q9FJA2 +Q8RYD9 diff --git a/tests/test_data/idmapping/mapped/mapped_gene_ids.csv b/tests/test_data/idmapping/mapped/mapped_gene_ids.csv new file mode 100644 index 00000000..84561688 --- /dev/null +++ b/tests/test_data/idmapping/mapped/mapped_gene_ids.csv @@ -0,0 +1,4 @@ +original_gene_id,gene_id +ENSRNA049434199,ENSRNA049434199 +ENSRNA049434246,ENSRNA049434246 +ENSRNA049434252,ENSRNA049434252 diff --git a/tests/test_data/idmapping/mapped/no_valid_gene_id.txt b/tests/test_data/idmapping/mapped/no_valid_gene_id.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_data/idmapping/mapped/valid_gene_ids.txt b/tests/test_data/idmapping/mapped/valid_gene_ids.txt new file mode 100644 index 00000000..4fc4b319 --- /dev/null +++ b/tests/test_data/idmapping/mapped/valid_gene_ids.txt @@ -0,0 +1,2 @@ +ENSRNA049434199 +ENSRNA049434246 diff --git a/tests/test_data/idmapping/not_found/counts.csv b/tests/test_data/idmapping/not_found/counts.csv new file mode 100644 index 00000000..2b8ebd50 --- /dev/null +++ b/tests/test_data/idmapping/not_found/counts.csv @@ -0,0 +1,4 @@ +sample_1,sample_2,sample_3 +8173941,1,2,3 +8168737,1,2,3 +8067017,1,2,3 diff --git a/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv b/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv new file mode 100644 index 00000000..b1e1511d --- /dev/null +++ b/tests/test_data/idmapping/tsv/counts.ensembl_ids.tsv @@ -0,0 +1,4 @@ +gene_id ERR029909 ERR029910 ERR029911 ERR029912 ERR029913 ERR029914 ERR029915 ERR029916 ERR029917 ERR029918 ERR029920 ERR029921 ERR029922 ERR029923 ERR029924 +ENSRNA049434199 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ENSRNA049434246 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ENSRNA049434252 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/test_data/idmapping/tsv/mapping.tsv b/tests/test_data/idmapping/tsv/mapping.tsv new file mode 100644 index 00000000..c425f89f --- /dev/null +++ 
b/tests/test_data/idmapping/tsv/mapping.tsv @@ -0,0 +1,4 @@ +original_gene_id gene_id +ENSRNA049434199 SNSRNA049434199 +ENSRNA049434246 SNSRNA049434246 +ENSRNA049434252 SNSRNA049434252 diff --git a/tests/test_data/idmapping/tsv/metadata.tsv b/tests/test_data/idmapping/tsv/metadata.tsv new file mode 100644 index 00000000..11eae353 --- /dev/null +++ b/tests/test_data/idmapping/tsv/metadata.tsv @@ -0,0 +1,4 @@ +gene_id name description +SNSRNA049434199 geneA descriptionA +SNSRNA049434246 geneB descriptionB +SNSRNA049434252 geneC descriptionC diff --git a/tests/test_data/idmapping/tsv/valid_gene_ids.txt b/tests/test_data/idmapping/tsv/valid_gene_ids.txt new file mode 100644 index 00000000..4fc4b319 --- /dev/null +++ b/tests/test_data/idmapping/tsv/valid_gene_ids.txt @@ -0,0 +1,2 @@ +ENSRNA049434199 +ENSRNA049434246 diff --git a/tests/test_data/input_datasets/gene_lengths.csv b/tests/test_data/input_datasets/gene_lengths.csv new file mode 100644 index 00000000..03ffba68 --- /dev/null +++ b/tests/test_data/input_datasets/gene_lengths.csv @@ -0,0 +1,10 @@ +gene_id,length +ENSRNA049453121,100 +ENSRNA049453138,200 +ENSRNA049454388,300 +ENSRNA049454416,400 +ENSRNA049454647,500 +ENSRNA049454661,600 +ENSRNA049454747,700 +ENSRNA049454887,800 +ENSRNA049454931,900 diff --git a/tests/test_data/input_datasets/input.csv b/tests/test_data/input_datasets/input.csv new file mode 100644 index 00000000..73278d53 --- /dev/null +++ b/tests/test_data/input_datasets/input.csv @@ -0,0 +1,3 @@ +counts,design,platform,normalised +https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/microarray.normalised.csv,https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/microarray.normalised.design.csv,microarray,true 
+https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq.raw.csv,https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq.raw.design.csv,rnaseq,false diff --git a/tests/test_data/input_datasets/input_big.yaml b/tests/test_data/input_datasets/input_big.yaml new file mode 100644 index 00000000..f54577bb --- /dev/null +++ b/tests/test_data/input_datasets/input_big.yaml @@ -0,0 +1,4 @@ +- counts: https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/modules_testdata/SRP254919.salmon.merged.gene_counts.top1000cov.assay.tsv + design: https://raw.githubusercontent.com/nf-core/test-datasets/stableexpression/test_data/input_datasets/rnaseq_big.design.csv + platform: rnaseq + normalised: false diff --git a/tests/test_data/input_datasets/mapping.csv b/tests/test_data/input_datasets/mapping.csv new file mode 100644 index 00000000..04489426 --- /dev/null +++ b/tests/test_data/input_datasets/mapping.csv @@ -0,0 +1,10 @@ +original_gene_id,gene_id +ENSRNA049453121,SNSRNA049434199 +ENSRNA049453138,SNSRNA049434246 +ENSRNA049454388,SNSRNA049434252 +ENSRNA049454416,SNSRNA049434253 +ENSRNA049454647,SNSRNA049434254 +ENSRNA049454661,SNSRNA049434255 +ENSRNA049454747,SNSRNA049434256 +ENSRNA049454887,SNSRNA049434257 +ENSRNA049454931,SNSRNA049434258 diff --git a/tests/test_data/input_datasets/metadata.csv b/tests/test_data/input_datasets/metadata.csv new file mode 100644 index 00000000..fcccf222 --- /dev/null +++ b/tests/test_data/input_datasets/metadata.csv @@ -0,0 +1,10 @@ +gene_id,name,description +ENSRNA049453121,geneA,descriptionA +ENSRNA049453138,geneB,descriptionB +ENSRNA049454388,geneC,descriptionC +ENSRNA049454416,geneD,descriptionD +ENSRNA049454647,geneE,descriptionE +ENSRNA049454661,geneF,descriptionF +ENSRNA049454747,geneG,descriptionG +ENSRNA049454887,geneH,descriptionH +ENSRNA049454931,geneI,descriptionI diff --git 
a/tests/test_data/input_datasets/microarray.normalised.csv b/tests/test_data/input_datasets/microarray.normalised.csv new file mode 100644 index 00000000..81f3f904 --- /dev/null +++ b/tests/test_data/input_datasets/microarray.normalised.csv @@ -0,0 +1,10 @@ +gene_id,GSM1528575,GSM1528576,GSM1528579,GSM1528583,GSM1528584,GSM1528585,GSM1528580,GSM1528586,GSM1528582,GSM1528578,GSM1528581,GSM1528577 +ENSRNA049453121,20925.1255070264,136184.261516502,144325.370645564,89427.0987612997,164143.182734208,34178.6378088171,28842.7323281157,76973.395782103,41906.9367255656,44756.5602263121,252562.049703724,6953.65643340122 +ENSRNA049453138,196173.051628372,16607.8367703051,344972.83715281,22602.4535330758,13678.598561184,104546.421532852,15451.4637472048,71664.8857281649,160643.257448002,91459.0578537683,88396.7173963033,281623.08555275 +ENSRNA049454388,91547.4240932405,11625.4857392136,84483.143792525,80582.6604222701,218857.576978944,58304.7350856292,42234.0009090266,88475.1675656357,87306.1181782617,17513.436610296,90922.3378933406,76490.2207674135 +ENSRNA049454416,20925.1255070264,106290.155329953,193607.204524536,47170.3378081581,392119.825420608,190998.270108096,90648.5873169351,81397.1541603848,83813.8734511313,165404.67909724,111127.301869638,194702.380135234 +ENSRNA049454647,99394.3461583754,91343.1022366783,3520.13099135521,71738.2220832404,118547.854196928,20105.0810640101,81377.7090686122,15040.7784861581,66352.6498154789,110918.431865208,55563.6509348192,111258.50293442 +ENSRNA049454661,175247.926121346,66431.3470812206,24640.9169394865,52083.9146631746,360203.095444512,36189.1459152181,70046.6356539953,85820.9125386666,13968.9789085219,50594.3724297441,25256.2049703724,52152.4232505092 +ENSRNA049454747,117703.830977024,154452.881963838,281610.479308417,29481.4611300988,191500.379856576,152798.616086476,53565.0743236435,14156.0268105017,293348.557078959,155674.99209152,63140.5124259309,243377.975169043 
+ENSRNA049454887,2615.6406883783,164417.584026021,28161.0479308417,82548.0911642767,50154.861391008,136714.551235268,97859.270398964,64586.872322914,328271.004350264,159566.866893808,151537.229822234,86920.7054175153 +ENSRNA049454931,177863.566809724,81378.4001744952,235848.776420799,88444.3833902964,18238.131414912,120630.48638406,82407.8066517592,50430.8455124123,118736.320722436,68107.8090400402,232357.085727426,163410.926184929 diff --git a/tests/test_data/input_datasets/microarray.normalised.design.csv b/tests/test_data/input_datasets/microarray.normalised.design.csv new file mode 100644 index 00000000..d31e5cef --- /dev/null +++ b/tests/test_data/input_datasets/microarray.normalised.design.csv @@ -0,0 +1,13 @@ +sample,condition +GSM1528575,g1 +GSM1528576,g1 +GSM1528579,g1 +GSM1528583,g2 +GSM1528584,g2 +GSM1528585,g2 +GSM1528580,g3 +GSM1528586,g3 +GSM1528582,g3 +GSM1528578,g4 +GSM1528581,g4 +GSM1528577,g4 diff --git a/tests/test_data/input_datasets/rnaseq.raw.csv b/tests/test_data/input_datasets/rnaseq.raw.csv new file mode 100644 index 00000000..5688c066 --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq.raw.csv @@ -0,0 +1,10 @@ +gene_id,ESM1528575,ESM1528576,ESM1528579,ESM1528583,ESM1528584,ESM1528585,ESM1528580,ESM1528586,ESM1528582,ESM1528578,ESM1528581,ESM1528577 +ENSRNA049453121,1,82,8,82,4,68,88,73,46,57,25,22 +ENSRNA049453138,68,93,41,84,36,18,28,92,84,85,92,32 +ENSRNA049454388,38,10,0,23,11,17,95,57,25,82,10,70 +ENSRNA049454416,75,55,7,30,79,60,15,97,12,35,60,56 +ENSRNA049454647,35,64,55,91,48,95,68,100,24,26,100,47 +ENSRNA049454661,8,99,80,48,86,29,80,17,19,9,44,2 +ENSRNA049454747,67,7,98,53,3,10,52,87,4,80,22,15 +ENSRNA049454887,8,40,24,90,42,52,79,81,94,23,35,81 +ENSRNA049454931,45,49,67,73,26,76,41,16,34,47,36,25 diff --git a/tests/test_data/input_datasets/rnaseq.raw.design.csv b/tests/test_data/input_datasets/rnaseq.raw.design.csv new file mode 100644 index 00000000..469751d2 --- /dev/null +++ 
b/tests/test_data/input_datasets/rnaseq.raw.design.csv @@ -0,0 +1,13 @@ +sample,condition +ESM1528575,g1 +ESM1528576,g1 +ESM1528579,g1 +ESM1528583,g2 +ESM1528584,g2 +ESM1528585,g2 +ESM1528580,g3 +ESM1528586,g3 +ESM1528582,g3 +ESM1528578,g4 +ESM1528581,g4 +ESM1528577,g4 diff --git a/tests/test_data/input_datasets/rnaseq_big.design.csv b/tests/test_data/input_datasets/rnaseq_big.design.csv new file mode 100644 index 00000000..e8de12df --- /dev/null +++ b/tests/test_data/input_datasets/rnaseq_big.design.csv @@ -0,0 +1,7 @@ +sample,condition +SRX8042381,control +SRX8042382,control +SRX8042383,control +SRX8042384,treatment +SRX8042385,treatment +SRX8042386,treatment diff --git a/tests/test_data/merge_data/input/counts1.parquet b/tests/test_data/merge_data/input/counts1.parquet new file mode 100644 index 00000000..b4d98e52 Binary files /dev/null and b/tests/test_data/merge_data/input/counts1.parquet differ diff --git a/tests/test_data/merge_data/input/counts2.parquet b/tests/test_data/merge_data/input/counts2.parquet new file mode 100644 index 00000000..e9eca845 Binary files /dev/null and b/tests/test_data/merge_data/input/counts2.parquet differ diff --git a/tests/test_data/merge_data/input/counts3.parquet b/tests/test_data/merge_data/input/counts3.parquet new file mode 100644 index 00000000..af4fa697 Binary files /dev/null and b/tests/test_data/merge_data/input/counts3.parquet differ diff --git a/tests/test_data/merge_data/input/dataset_stat1.csv b/tests/test_data/merge_data/input/dataset_stat1.csv new file mode 100644 index 00000000..feca6c83 --- /dev/null +++ b/tests/test_data/merge_data/input/dataset_stat1.csv @@ -0,0 +1,10 @@ +sample,count,skewness,kolmogorov_smirnov_to_uniform_dist_pvalue +ARR029909,1,1,1 +ARR029910,2,3,1 +ARR029911,3,5,1 +ARR029912,4,4,9 +ARR029913,5,1,5 +ARR029914,6,6,6 +ARR029915,7,1,9 +ARR029916,8,8,1 +ARR029917,9,3,9 diff --git a/tests/test_data/merge_data/input/dataset_stat2.csv b/tests/test_data/merge_data/input/dataset_stat2.csv new file 
mode 100644 index 00000000..a7c0ea8b --- /dev/null +++ b/tests/test_data/merge_data/input/dataset_stat2.csv @@ -0,0 +1,10 @@ +sample,count,skewness,kolmogorov_smirnov_to_uniform_dist_pvalue +URR029909,1,1,1 +URR029910,2,2,2 +URR029911,3,2,3 +URR029912,4,4,4 +URR029913,5,5,5 +URR029914,6,6,3 +URR029915,7,7,7 +URR029916,8,8,8 +URR029917,9,9,9 diff --git a/tests/test_data/merge_data/input/dataset_stat3.csv b/tests/test_data/merge_data/input/dataset_stat3.csv new file mode 100644 index 00000000..28be6731 --- /dev/null +++ b/tests/test_data/merge_data/input/dataset_stat3.csv @@ -0,0 +1,10 @@ +sample,count,skewness,kolmogorov_smirnov_to_uniform_dist_pvalue +ERR029909,1,1,1 +ERR029910,2,2,2 +ERR029911,3,3,3 +ERR029912,4,9,4 +ERR029913,5,5,5 +ERR029914,6,6,6 +ERR029915,7,7,7 +ERR029916,8,8,1 +ERR029917,9,9,9 diff --git a/tests/test_data/merge_data/input/design1.csv b/tests/test_data/merge_data/input/design1.csv new file mode 100644 index 00000000..f9b61c49 --- /dev/null +++ b/tests/test_data/merge_data/input/design1.csv @@ -0,0 +1,10 @@ +sample,condition +ARR029909,g1 +ARR029910,g1 +ARR029911,g1 +ARR029912,g2 +ARR029913,g2 +ARR029914,g2 +ARR029915,g3 +ARR029916,g3 +ARR029917,g3 diff --git a/tests/test_data/merge_data/input/design2.csv b/tests/test_data/merge_data/input/design2.csv new file mode 100644 index 00000000..dcb29ec8 --- /dev/null +++ b/tests/test_data/merge_data/input/design2.csv @@ -0,0 +1,10 @@ +sample,condition +URR029909,g1 +URR029910,g1 +URR029911,g1 +URR029912,g2 +URR029913,g2 +URR029914,g2 +URR029915,g3 +URR029916,g3 +URR029917,g3 diff --git a/tests/test_data/merge_data/input/design3.csv b/tests/test_data/merge_data/input/design3.csv new file mode 100644 index 00000000..75caca86 --- /dev/null +++ b/tests/test_data/merge_data/input/design3.csv @@ -0,0 +1,10 @@ +batch,sample,condition +batch3,ERR029909,g1 +batch3,ERR029910,g1 +batch3,ERR029911,g1 +batch3,ERR029912,g2 +batch3,ERR029913,g2 +batch3,ERR029914,g2 +batch3,ERR029915,g3 +batch3,ERR029916,g3 
+batch3,ERR029917,g3 diff --git a/tests/test_data/merge_data/output/all_counts.csv b/tests/test_data/merge_data/output/all_counts.csv new file mode 100644 index 00000000..527a2205 --- /dev/null +++ b/tests/test_data/merge_data/output/all_counts.csv @@ -0,0 +1,15 @@ +gene_id,URR029909,URR029910,URR029911,URR029912,URR029913,URR029914,URR029915,URR029916,URR029917,ERR029909,ERR029910,ERR029911,ERR029912,ERR029913,ERR029914,ERR029915,ERR029916,ERR029917,ARR029909,ARR029910,ARR029911,ARR029912,ARR029913,ARR029914,ARR029915,ARR029916,ARR029917 +AT1G34790,0.60113057,0.64080682,0.6,0.6197164000000003,0.60115891,0.63052843,0.61002869,0.65849011,0.66239896,0.60113057,0.64080682,0.6348181099999999,0.6519716400000001,0.60115891,0.63052843,0.61002869,0.65849011,0.66239896,0.60113057,0.64080682,0.6348181099999999,0.6519716400000001,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35550,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.0,0.8336608,0.00340416,0.23179154000000002,0.0,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 +AT5G23260,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 +AT1G34791,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 
+AT5G35551,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.0,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 +AT5G23261,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 +AT1G34792,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.0,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35552,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002,0.7148504699999999,0.21713193,0.03318757,0.18404821999999998,0.70246917,0.7555268599999999,0.8336608,0.00340416,0.23179154000000002 +AT5G23262,0.0,0.47981484,0.85599454,0.69023553,0.0,0.30220852000000004,0.73996866,0.08559519,0.80013134,0.71122807,0.47981484,0.85599454,0.0,0.0,0.0,0.73996866,0.0,0.0,0.0,0.0,0.0,0.69023553,0.40420572,0.30220852000000004,0.73996866,0.08559519,0.80013134 
+AT1G34793,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.0,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896,0.60113057,0.64080682,0.9348181099999999,0.35197164000000003,0.20115891000000002,0.93052843,0.71002869,0.65849011,0.16239896 +AT5G35553,,0.21713193,,,,,,,,,,,,,0,,,,,0.9348181099999999,,,0.35197164000000003,,,,0.0 +AT5G35554,,0.01713193,,,,,,,,,,,,,0.01,,,,,0.15,,,0.151,,,,0.0114 +AT5G35555,,0.01713193,,,,,0.0,,,,,,,,0.01,,,,,0.0,,,0.151,,,,0.011 +AT5G23263,,,,,,,,,,,,,,,,,,,,,,,,,,, diff --git a/tests/test_data/merge_data/output/all_counts.parquet b/tests/test_data/merge_data/output/all_counts.parquet new file mode 100644 index 00000000..35fb1441 Binary files /dev/null and b/tests/test_data/merge_data/output/all_counts.parquet differ diff --git a/tests/test_data/misc/accessions_to_include.txt b/tests/test_data/misc/accessions_to_include.txt new file mode 100644 index 00000000..7020d409 --- /dev/null +++ b/tests/test_data/misc/accessions_to_include.txt @@ -0,0 +1,2 @@ +E-MTAB-4252 +E-MTAB-4253 diff --git a/tests/test_data/misc/excluded_accessions.txt b/tests/test_data/misc/excluded_accessions.txt new file mode 100644 index 00000000..6c403a93 --- /dev/null +++ b/tests/test_data/misc/excluded_accessions.txt @@ -0,0 +1,2 @@ +E-MTAB-4251 +E-MTAB-4301 diff --git a/tests/test_data/normalisation/base/counts.csv b/tests/test_data/normalisation/base/counts.csv new file mode 100644 index 00000000..ba76be4e --- /dev/null +++ b/tests/test_data/normalisation/base/counts.csv @@ -0,0 +1,13 @@ +,E_MTAB_5038_rnaseq_SRR1586392,E_MTAB_5038_rnaseq_SRR1586393,E_MTAB_5038_rnaseq_SRR1586394,E_MTAB_5038_rnaseq_SRR1586395,E_MTAB_5038_rnaseq_SRR1586396,E_MTAB_5038_rnaseq_SRR1586397,E_MTAB_5038_rnaseq_SRR1586400,E_MTAB_5038_rnaseq_SRR1586401,E_MTAB_5038_rnaseq_SRR1586402 +ENSRNA549434199,14,25,27,47,39,34,38,19,64 
+ENSRNA549434200,91,37,78,84,6,51,18,2,57 +ENSRNA549434201,98,48,69,7,73,48,57,92,36 +ENSRNA549434202,52,15,41,19,8,100,85,83,97 +ENSRNA549434203,86,71,53,16,66,23,12,42,33 +ENSRNA549434204,62,2,25,89,74,32,45,56,26 +ENSRNA549434205,98,42,79,76,74,85,3,91,56 +ENSRNA549434206,42,49,4,88,82,34,27,83,98 +ENSRNA549434207,82,93,85,14,38,8,98,97,30 +ENSRNA549434208,72,36,4,60,25,7,14,76,47 +ENSRNA549434209,65,12,99,82,72,52,24,79,31 +ENSRNA549434210,0,0,0,0,0,0,0,0,0 diff --git a/tests/test_data/normalisation/base/counts.tsv b/tests/test_data/normalisation/base/counts.tsv new file mode 100644 index 00000000..17db2d66 --- /dev/null +++ b/tests/test_data/normalisation/base/counts.tsv @@ -0,0 +1,13 @@ + E_MTAB_5038_rnaseq_SRR1586392 E_MTAB_5038_rnaseq_SRR1586393 E_MTAB_5038_rnaseq_SRR1586394 E_MTAB_5038_rnaseq_SRR1586395 E_MTAB_5038_rnaseq_SRR1586396 E_MTAB_5038_rnaseq_SRR1586397 E_MTAB_5038_rnaseq_SRR1586400 E_MTAB_5038_rnaseq_SRR1586401 E_MTAB_5038_rnaseq_SRR1586402 +ENSRNA549434199 14 25 27 47 39 34 38 19 64 +ENSRNA549434200 91 37 78 84 6 51 18 2 57 +ENSRNA549434201 98 48 69 7 73 48 57 92 36 +ENSRNA549434202 52 15 41 19 8 100 85 83 97 +ENSRNA549434203 86 71 53 16 66 23 12 42 33 +ENSRNA549434204 62 2 25 89 74 32 45 56 26 +ENSRNA549434205 98 42 79 76 74 85 3 91 56 +ENSRNA549434206 42 49 4 88 82 34 27 83 98 +ENSRNA549434207 82 93 85 14 38 8 98 97 30 +ENSRNA549434208 72 36 4 60 25 7 14 76 47 +ENSRNA549434209 65 12 99 82 72 52 24 79 31 +ENSRNA549434210 0 0 0 0 0 0 0 0 0 diff --git a/tests/test_data/normalisation/base/design.csv b/tests/test_data/normalisation/base/design.csv new file mode 100644 index 00000000..ef161acb --- /dev/null +++ b/tests/test_data/normalisation/base/design.csv @@ -0,0 +1,10 @@ +batch,condition,sample +E_MTAB_5038_rnaseq,g1,E_MTAB_5038_rnaseq_SRR1586392 +E_MTAB_5038_rnaseq,g1,E_MTAB_5038_rnaseq_SRR1586393 +E_MTAB_5038_rnaseq,g1,E_MTAB_5038_rnaseq_SRR1586394 +E_MTAB_5038_rnaseq,g2,E_MTAB_5038_rnaseq_SRR1586395 
+E_MTAB_5038_rnaseq,g2,E_MTAB_5038_rnaseq_SRR1586396 +E_MTAB_5038_rnaseq,g2,E_MTAB_5038_rnaseq_SRR1586397 +E_MTAB_5038_rnaseq,g3,E_MTAB_5038_rnaseq_SRR1586400 +E_MTAB_5038_rnaseq,g3,E_MTAB_5038_rnaseq_SRR1586401 +E_MTAB_5038_rnaseq,g3,E_MTAB_5038_rnaseq_SRR1586402 diff --git a/tests/test_data/normalisation/base/design.tsv b/tests/test_data/normalisation/base/design.tsv new file mode 100644 index 00000000..fca7e731 --- /dev/null +++ b/tests/test_data/normalisation/base/design.tsv @@ -0,0 +1,10 @@ +batch condition sample +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586392 +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586393 +E_MTAB_5038_rnaseq g1 E_MTAB_5038_rnaseq_SRR1586394 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586395 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586396 +E_MTAB_5038_rnaseq g2 E_MTAB_5038_rnaseq_SRR1586397 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586400 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586401 +E_MTAB_5038_rnaseq g3 E_MTAB_5038_rnaseq_SRR1586402 diff --git a/tests/test_data/normalisation/base/gene_lengths.csv b/tests/test_data/normalisation/base/gene_lengths.csv new file mode 100644 index 00000000..67b05cee --- /dev/null +++ b/tests/test_data/normalisation/base/gene_lengths.csv @@ -0,0 +1,13 @@ +gene_id,length +ENSRNA549434199,100 +ENSRNA549434200,200 +ENSRNA549434201,300 +ENSRNA549434202,400 +ENSRNA549434203,500 +ENSRNA549434204,600 +ENSRNA549434205,700 +ENSRNA549434206,800 +ENSRNA549434207,900 +ENSRNA549434208,1000 +ENSRNA549434209,1100 +ENSRNA549434210,1200 diff --git a/tests/test_data/normalisation/many_zeros/counts.csv b/tests/test_data/normalisation/many_zeros/counts.csv new file mode 100644 index 00000000..261de1aa --- /dev/null +++ b/tests/test_data/normalisation/many_zeros/counts.csv @@ -0,0 +1,6 @@ 
+,E_CURD_1_rnaseq_ERR274309,E_CURD_1_rnaseq_ERR274310,E_CURD_1_rnaseq_SRR070570,E_CURD_1_rnaseq_SRR070571,E_CURD_1_rnaseq_SRR1001909,E_CURD_1_rnaseq_SRR1001910,E_CURD_1_rnaseq_SRR1019221,E_CURD_1_rnaseq_SRR1046909,E_CURD_1_rnaseq_SRR1046910,E_CURD_1_rnaseq_SRR1105822,E_CURD_1_rnaseq_SRR1105823,E_CURD_1_rnaseq_SRR1106559,E_CURD_1_rnaseq_SRR1159821,E_CURD_1_rnaseq_SRR1159827,E_CURD_1_rnaseq_SRR1159831,E_CURD_1_rnaseq_SRR1159837,E_CURD_1_rnaseq_SRR949993 +AT1G80990,0,0,1,0,1,1,0,0,1,1,3,0,0,1,1,1,0 +AT2G01008,11,24,3,4,6,4,0,0,2,0,0,1,4,2,4,4,0 +AT2G01010,9,1,195,195,8,33,0,14,7,0,0,2,1,0,0,0,0 +AT2G01020,34,27,41,55,58,107,2,10,20,1,3,1,4,2,3,0,0 +AT2G01021,22,10,0,0,0,0,0,106,20,0,0,1,0,0,0,0,0 diff --git a/tests/test_data/normalisation/many_zeros/design.csv b/tests/test_data/normalisation/many_zeros/design.csv new file mode 100644 index 00000000..a6473d3a --- /dev/null +++ b/tests/test_data/normalisation/many_zeros/design.csv @@ -0,0 +1,18 @@ +batch,condition,sample +E_CURD_1_rnaseq,g2,E_CURD_1_rnaseq_ERR274309 +E_CURD_1_rnaseq,g3,E_CURD_1_rnaseq_ERR274310 +E_CURD_1_rnaseq,g23,E_CURD_1_rnaseq_SRR070570 +E_CURD_1_rnaseq,g23,E_CURD_1_rnaseq_SRR070571 +E_CURD_1_rnaseq,g55,E_CURD_1_rnaseq_SRR1001909 +E_CURD_1_rnaseq,g55,E_CURD_1_rnaseq_SRR1001910 +E_CURD_1_rnaseq,g56,E_CURD_1_rnaseq_SRR1019221 +E_CURD_1_rnaseq,g48,E_CURD_1_rnaseq_SRR1046909 +E_CURD_1_rnaseq,g48,E_CURD_1_rnaseq_SRR1046910 +E_CURD_1_rnaseq,g50,E_CURD_1_rnaseq_SRR1105822 +E_CURD_1_rnaseq,g50,E_CURD_1_rnaseq_SRR1105823 +E_CURD_1_rnaseq,g50,E_CURD_1_rnaseq_SRR1106559 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159821 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159827 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159831 +E_CURD_1_rnaseq,g6,E_CURD_1_rnaseq_SRR1159837 +E_CURD_1_rnaseq,g44,E_CURD_1_rnaseq_SRR949993 diff --git a/tests/test_data/normalisation/many_zeros/gene_lengths.csv b/tests/test_data/normalisation/many_zeros/gene_lengths.csv new file mode 100644 index 00000000..923e2d65 --- /dev/null +++ 
b/tests/test_data/normalisation/many_zeros/gene_lengths.csv @@ -0,0 +1,6 @@ +gene_id,length +AT1G80990,100 +AT2G01008,200 +AT2G01010,300 +AT2G01020,400 +AT2G01021,500 diff --git a/tests/test_data/normalisation/one_group/counts.csv b/tests/test_data/normalisation/one_group/counts.csv new file mode 100644 index 00000000..0ec999c9 --- /dev/null +++ b/tests/test_data/normalisation/one_group/counts.csv @@ -0,0 +1,6 @@ +sampleA,sampleB,sampleC,sampleD +ENSG00000000003,14,4,4,10 +ENSG00000000005,0,0,0,0 +ENSG00000000419,562,584,523,616 +ENSG00000000457,586,377,207,491 +ENSG00000000460,130,55,28,77 diff --git a/tests/test_data/normalisation/one_group/design.csv b/tests/test_data/normalisation/one_group/design.csv new file mode 100644 index 00000000..9aaadb1d --- /dev/null +++ b/tests/test_data/normalisation/one_group/design.csv @@ -0,0 +1,5 @@ +batch,condition,sample +batch1,g1,sampleA +batch1,g1,sampleB +batch1,g1,sampleC +batch1,g1,sampleD diff --git a/tests/test_data/normalisation/one_group/gene_lengths.csv b/tests/test_data/normalisation/one_group/gene_lengths.csv new file mode 100644 index 00000000..73eb9655 --- /dev/null +++ b/tests/test_data/normalisation/one_group/gene_lengths.csv @@ -0,0 +1,6 @@ +gene_id,length +ENSG00000000003,100 +ENSG00000000005,200 +ENSG00000000419,300 +ENSG00000000457,400 +ENSG00000000460,500 diff --git a/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet b/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet new file mode 100644 index 00000000..bc192ae3 Binary files /dev/null and b/tests/test_data/normfinder/small_normalised/all_counts.normalised.parquet differ diff --git a/tests/test_data/normfinder/small_normalised/design.csv b/tests/test_data/normfinder/small_normalised/design.csv new file mode 100644 index 00000000..6a212658 --- /dev/null +++ b/tests/test_data/normfinder/small_normalised/design.csv @@ -0,0 +1,12 @@ +batch,condition,sample +E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883576 
+E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883577 +E_MTAB_11876_rnaseq,g1,E_MTAB_11876_rnaseq_ERR9883578 +E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883579 +E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883580 +E_MTAB_11876_rnaseq,g2,E_MTAB_11876_rnaseq_ERR9883581 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948460 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948461 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948462 +E_MTAB_4789_rnaseq,g8,E_MTAB_4789_rnaseq_SRR948463 +E_MTAB_4789_rnaseq,g9,E_MTAB_4789_rnaseq_SRR948464 diff --git a/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet b/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet new file mode 100644 index 00000000..5e31470f Binary files /dev/null and b/tests/test_data/normfinder/very_small_cq/all_counts.normalised.parquet differ diff --git a/tests/test_data/normfinder/very_small_cq/design.csv b/tests/test_data/normfinder/very_small_cq/design.csv new file mode 100644 index 00000000..221601a5 --- /dev/null +++ b/tests/test_data/normfinder/very_small_cq/design.csv @@ -0,0 +1,7 @@ +sample,condition,batch +S1,control,A +S2,treated,A +S3,control,A +S4,treated,A +S5,control,A +S6,treated,A diff --git a/tests/test_data/normfinder/very_small_cq/normfinder.R b/tests/test_data/normfinder/very_small_cq/normfinder.R new file mode 100644 index 00000000..f415f95f --- /dev/null +++ b/tests/test_data/normfinder/very_small_cq/normfinder.R @@ -0,0 +1,298 @@ +library(optparse) +library(dplyr) +library(tidyr) + + +get_args <- function() { + option_list <- list( + make_option("--data", type = "character") + ) + + args <- parse_args(OptionParser( + option_list = option_list, + description = "Normfinder" + )) + return(args) +} + + +normfinder<-function(data, group = TRUE, ctVal=FALSE, pStabLim=0.3, sample = "sample", gene = "gene", groups = "group", cq = "cq"){ + + # Group & sample ID + sample_group <- unique(data[,c(sample, groups)]) + + tmp <- data.frame(sample = as.character(data[, 
sample]), + gene = as.character(data[, gene]), + cq = as.numeric(data[, cq])) + tmp <- tmp %>% + dplyr::group_by(sample, gene) %>% + dplyr::summarise(cq=mean(cq, na.rm=T)) %>% + tidyr::spread(sample, cq) + + ntotal<-length(sample_group[,1]) + + if (group == TRUE){ + ngenes <- length(tmp$gene) # number of genes + genenames <- as.character(tmp$gene) + grId <- factor(sample_group[,2]) + } else { + ngenes <- length(tmp$gene) # number of genes + genenames <- as.character(tmp$gene) + grId <- rep(1,ntotal) + } + + tmp <- data.matrix(tmp[,sample_group[,1]]) + + if (!ctVal){tmp<-log2(tmp)} + + + groupnames <- levels(grId) + ngr <- length(levels(grId)) + + # Number of samples in each group: + nsamples <- rep(0,ngr) + for (group in 1:ngr){nsamples[group] <- sum(grId==groupnames[group])} + + + + MakeStab <- function(da){ + + ngenes <- dim(da)[1] + # Sample averages + sampleavg <- apply(da,2,mean) + # Gene averages within group + genegroupavg <- matrix(0,ngenes,ngr) + + for (group in 1:ngr){ + genegroupavg[,group] <- apply(da[,grId==groupnames[group]],1,mean)} + + # Group averages + groupavg=rep(0,ngr) + for (group in 1:ngr){groupavg[group] <- mean(da[,grId==groupnames[group]])} + + # Variances + GGvar=matrix(0,ngenes,ngr) + for (group in 1:ngr){ + grset <- (grId==groupnames[group]) + a=rep(0,ngenes) + for (gene in 1:ngenes){ + a[gene] <- sum((da[gene,grset]-genegroupavg[gene,group]- + sampleavg[grset]+groupavg[group])^2)/(nsamples[group]-1) + } + GGvar[,group] <- (a-sum(a)/(ngenes*ngenes-ngenes))/(1-2/ngenes) + } + + print("GGvar") + print(GGvar) + + # + # Change possible negative values + genegroupMinvar <- matrix(0, ngenes, ngr) + for (group in 1:ngr){ + grset <- (grId == groupnames[group]) + z <- da[,grset] + for (gene in 1:ngenes){ + varpair <- rep(0,ngenes) + for (gene1 in 1:ngenes){varpair[gene1] <- var(z[gene,] - z[gene1,])} + genegroupMinvar[gene,group] <- min(varpair[-gene])/4 + } + } + # + # Final variances + GGvar <- ifelse(GGvar < 0, genegroupMinvar, GGvar) + 
print("GGvar") + print(GGvar) + # + # Old stability measure for each gene is calculated: + # + dif <- genegroupavg + difgeneavg <- apply(dif, 1, mean) + difgroupavg <- apply(dif, 2, mean) + difavg <- mean(dif) + for (gene in 1:ngenes){ + for (group in 1:ngr){ + dif[gene,group] <- dif[gene, group] - difgeneavg[gene] - difgroupavg[group] + difavg + } + } + # + nsampMatrix <- matrix(rep(nsamples,ngenes),ngenes,ngr,byrow=T) + vardif <- GGvar/nsampMatrix + gamma <- sum(dif * dif) / ((ngr-1) * (ngenes-1)) -sum (vardif) / (ngenes*ngr) + gamma <- ifelse(gamma<0,0,gamma) + # + difnew <- dif * gamma / (gamma+vardif) + varnew <- vardif + gamma * vardif / (gamma+vardif) + Ostab0 <- abs(difnew) + sqrt(varnew) + Ostab <- apply(Ostab0, 1, mean) + + # + # Measure of group differences: + mud <- rep(0,ngenes) + for (gene in 1:ngenes){ + mud[gene] <- 2*max(abs(dif[gene,])) + } + # Common variance: + genevar <- rep(0,ngenes) + for (gene in 1:ngenes){ + genevar[gene] <- sum((nsamples-1) * GGvar[gene,]) / (sum(nsamples)-ngr) + } + Gsd <- sqrt(genevar) + # + # Return results: + # + return(cbind(mud, Gsd, Ostab, rep(gamma,ngenes), GGvar,dif)) + } # End of function MakeStab + # + # + MakeComb2 <- function(g1, g2, res){ + gam <- res[1,4] + d1 <- res[g1,(4 + ngr + 1):(4 + ngr + ngr)]; d2 <- res[g2, (4 + ngr + 1):(4+ngr+ngr)] + s1 <- res[g1, (4+1):(4+ngr)]; s2 <- res[g2, (4+1):(4+ngr)] + rho <- abs(gam * d1 / (gam + s1 / nsamples) + gam * d2 / (gam + s2 / nsamples)) * sqrt(ngenes / (ngenes-2)) / 2 + rho <- rho + sqrt(s1 / nsamples + gam * s1 / (nsamples*gam+s1) + s2 / nsamples + gam * s2 / (nsamples*gam+s2))/2 + return(mean(rho)) + } + # + # + MakeStabOne <- function(da){ + ngenes <- dim(da)[1] + # Sample averages + sampleavg <- apply(da, 2, mean) + # Gene averages + geneavg <- apply(da, 1, mean) + totalavg <- mean(da) + # + # Variances + genevar0 <- rep(0, ngenes) + for (gene in 1:ngenes){ + genevar0[gene] <- sum((tmp[gene,] - geneavg[gene] - sampleavg + totalavg)^2) / ((ntotal-1) * 
(1-2/ngenes)) + } + genevar <- genevar0 - sum(genevar0) / (ngenes*ngenes-ngenes) + # + # Change possible negative values + geneMinvar <- rep(0,ngenes) + z <- da + for (gene in 1:ngenes){ + varpair <- rep(0, ngenes) + for (gene1 in 1:ngenes){varpair[gene1] <- var(z[gene,] - z[gene1,])} + geneMinvar[gene] <- min(varpair[-gene]) / 4 + } + # Final variances + genevar = ifelse(genevar<0, geneMinvar, genevar) + # + return(genevar) + } + # End of function MakeStabOne + + #### Main function #### + if (ngr>1){ # More than one group. + # + res <- MakeStab(tmp) + # + gcand <- c(1:ngenes)[res[,3] < pStabLim] + ncand <- length(gcand) + if (ncand<4){ + if (ngenes>3){ + li <- sort(res[,3])[4] + gcand <- c(1:ngenes)[res[,3]<=li] + ncand <- length(gcand) + } else { + gcand <- c(1:ngenes) + ncand <- length(gcand) + } + } + # + vv2 <- c() + # + for (g1 in 1:(ncand-1)){ + for (g2 in (g1+1):ncand){ + qmeas <- MakeComb2(gcand[g1], gcand[g2], res) + vv2 <- rbind(vv2, c(gcand[g1], gcand[g2], qmeas)) + }} + # + ord <- order(res[,3]) + FinalRes <- list(Ordered <- data.frame("GroupDif" = round(res[ord,1],3), + "GroupSD" = round(res[ord,2],3), + "Stability" = round(res[ord,3],3), + row.names = genenames[ord]), + UnOrdered <- data.frame("GroupDif" = round(res[,1],3), + "GroupSD" = round(res[,2],3), + "Stability" = round(res[,3],3), + "IGroupSD" = round(sqrt(res[,(4+1):(4+ngr)]),3), + "IGroupDif" = round(res[,(4+ngr+1):(4+ngr+ngr)],3), + row.names = genenames), + PairOfGenes <- data.frame("Gene1" = genenames[vv2[,1]], + "Gene2" = genenames[vv2[,2]], + "Stability" = round(vv2[,3],3))) + # + return(FinalRes) + # + } else { # End of more than one group: next is for one group only. 
+ # + # + sigma <- sqrt(MakeStabOne(tmp)) + # + siglim <- (min(sigma)+0.1) + gcand <- c(1:ngenes)[sigma=2) & (ngenes>3)){ + # + vv2=c() + # + for (g1 in 1:(ncand-1)){ + for (g2 in (g1+1):ncand){ + dat1 <- rbind(tmp[-c(gcand[g1], gcand[g2]),], + apply(tmp[c(gcand[g1], gcand[g2]),], 2, mean)) + qmeas <- sqrt(MakeStabOne(dat1)) + vv2 <- rbind(vv2, c(gcand[g1], gcand[g2], qmeas[ngenes-1])) + }} + ord <- order(sigma) + FinalRes <- list(Ordered <- data.frame("GroupSD" = round(sigma[ord],3), + row.names = genenames[ord]), + PairOfGenes <- data.frame("Gene1" = genenames[vv2[,1]], + "Gene2" = genenames[vv2[,2]], + "GroupSD" = round(vv2[,3],3))) + } else { # No combined genes to consider + ord <- order(sigma) + FinalRes <- list(Ordered <- data.frame("GroupSD" = round(sigma[ord],3), + row.names = genenames[ord])) + } # End ncand<2 or ngenes<=3 + # + return(FinalRes) + # + } # End one group only + +} ##### + +# Read the counts file +counts <- read.csv("all_counts.normfinder.csv") + +# Build design (conditions per sample) +design <- data.frame( + sample = c("S1","S2","S3","S4","S5","S6"), + group = c("control","treated","control","treated","control","treated") +) + +# Convert counts wide → long +library(tidyr) +library(dplyr) + +data <- counts %>% + tidyr::pivot_longer( + cols = -gene_id, + names_to = "sample", + values_to = "cq" + ) %>% + dplyr::rename(gene = gene_id) %>% + dplyr::left_join(design, by = "sample") + +# Inspect +#print(data) + +data <- as.data.frame(data) + + +res = normfinder(data, ctVal=TRUE) +print("res") +print(res) diff --git a/tests/test_data/public_accessions/exclude_one_geo_accession.txt b/tests/test_data/public_accessions/exclude_one_geo_accession.txt new file mode 100644 index 00000000..c6978b9b --- /dev/null +++ b/tests/test_data/public_accessions/exclude_one_geo_accession.txt @@ -0,0 +1 @@ +GSE55951 diff --git a/tests/test_data/public_accessions/exclude_two_geo_accessions.txt b/tests/test_data/public_accessions/exclude_two_geo_accessions.txt new file 
mode 100644 index 00000000..0ef19a43 --- /dev/null +++ b/tests/test_data/public_accessions/exclude_two_geo_accessions.txt @@ -0,0 +1,2 @@ +GSE79526 +GSE55951 diff --git a/tests/test_data/quantile_normalisation/count.raw.cpm.csv b/tests/test_data/quantile_normalisation/count.raw.cpm.csv new file mode 100644 index 00000000..e8ecde05 --- /dev/null +++ b/tests/test_data/quantile_normalisation/count.raw.cpm.csv @@ -0,0 +1,10 @@ +,sample_63,sample_64,sample_65,sample_66,sample_67,sample_68,sample_69,sample_70 +ENSRNA049454747,9.07095165125094,56.5509090498679,12.6897789869867,15.7656784862991,4.55005160208214,5.21967362537592,8.87627280506172,6.33326316409849 +ENSRNA049454887,0.740485849081709,1.66326203087847,0.229679257683017,0.785665040845472,2.20608562525195,2.37257892062542,0.365278716257684,0.139192597013154 +ENSRNA049454931,1.20328950475778,2.61369747709473,0.574198144207542,1.46657474291155,2.0682052736737,3.32161048887559,0.620973817638062,0.591568537305903 +ENSRNA049454947,1.48097169816342,2.1384797539866,0.459358515366033,1.57133008169094,2.89548738314318,3.08435259681304,0.474862331134989,0.452375940292749 +ENSRNA049454955,1.29585023589299,2.61369747709473,0.516778329786788,1.09993105718366,3.8606498441909,4.03338416506321,0.584445946012294,0.452375940292749 +ENSRNA049454963,1.38841096702821,4.51456836952727,1.43549536051885,2.7236388082643,4.96369265681688,5.45693151743846,1.35153125015343,1.25273337311838 +ENSRNA049454974,1.66609316043385,3.564132923311,2.52647183451318,2.46175046131581,5.51521406312986,12.5746682793147,1.71680996641111,1.53111856714469 +ENSRNA049455639,0.185121462270427,0.237608861554067,0.803877401890558,1.15230872657336,0.137880351578247,0.237257892062542,0.438334459509221,0.417577791039461 +ENSRNA049455690,0.0925607311352137,1.18804430777033,0.746457587469804,2.98552715521279,0.137880351578247,0.237257892062542,0.876668919018441,0.487174089546038 diff --git a/tower.yml b/tower.yml index 787aedfe..c61323c0 100644 --- a/tower.yml +++ 
b/tower.yml @@ -1,5 +1,3 @@ reports: - multiqc_report.html: - display: "MultiQC HTML report" samplesheet.csv: display: "Auto-created samplesheet with collated metadata and FASTQ paths" diff --git a/workflows/stableexpression.nf b/workflows/stableexpression.nf index c6494c50..a823a320 100644 --- a/workflows/stableexpression.nf +++ b/workflows/stableexpression.nf @@ -3,11 +3,19 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' + +include { GET_PUBLIC_ACCESSIONS } from '../subworkflows/local/get_public_accessions' +include { DOWNLOAD_PUBLIC_DATASETS } from '../subworkflows/local/download_public_datasets' +include { ID_MAPPING } from '../subworkflows/local/idmapping' +include { SAMPLE_FILTERING } from '../subworkflows/local/sample_filtering' +include { EXPRESSION_NORMALISATION } from '../subworkflows/local/expression_normalisation' +include { DATASET_ANALYSIS } from '../subworkflows/local/dataset_analysis' +include { MERGE_DATA } from '../subworkflows/local/merge_data' +include { GENE_STATISTICS } from '../subworkflows/local/gene_statistics' +include { STABILITY_SCORING } from '../subworkflows/local/stability_scoring' +include { REPORTING } from '../subworkflows/local/reporting' + +include { checkCounts } from '../subworkflows/local/utils_nfcore_stableexpression_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -18,84 +26,196 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_stab workflow STABLEEXPRESSION { take: - 
ch_samplesheet // channel: samplesheet read in from --input + ch_input_datasets + + main: - ch_versions = channel.empty() - ch_multiqc_files = channel.empty() - - // - // Collate and save software versions - // - def topic_versions = Channel.topic("versions") - .distinct() - .branch { entry -> - versions_file: entry instanceof Path - versions_tuple: true - } - - def topic_versions_string = topic_versions.versions_tuple - .map { process, tool, version -> - [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] - } - .groupTuple(by:0) - .map { process, tool_versions -> - tool_versions.unique().sort() - "${process}:\n${tool_versions.join('\n')}" - } - - softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) - .mix(topic_versions_string) - .collectFile( - storeDir: "${params.outdir}/pipeline_info", - name: 'nf_core_' + 'stableexpression_software_' + 'mqc_' + 'versions.yml', - sort: true, - newLine: true - ).set { ch_collated_versions } - - - // - // MODULE: MultiQC - // - ch_multiqc_config = channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - channel.fromPath(params.multiqc_config, checkIfExists: true) : - channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - channel.fromPath(params.multiqc_logo, checkIfExists: true) : - channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
- file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true + ch_accessions = channel.empty() + ch_downloaded_datasets = channel.empty() + ch_all_counts = channel.empty() + ch_all_imputed_counts = channel.empty() + ch_whole_design = channel.empty() + ch_stats_all_genes_with_scores = channel.empty() + ch_platform_statistics = channel.empty() + ch_whole_gene_metadata = channel.empty() + ch_whole_gene_id_mapping = channel.empty() + + def species = params.species.split(' ').join('_').toLowerCase() + + // ----------------------------------------------------------------- + // FETCH PUBLIC ACCESSIONS + // ----------------------------------------------------------------- + + GET_PUBLIC_ACCESSIONS( + species, + params.skip_fetch_eatlas_accessions, + params.fetch_geo_accessions, + params.platform, + params.keywords, + channel.fromList( params.accessions.tokenize(',') ), + params.accessions_file ? channel.fromPath(params.accessions_file, checkIfExists: true) : channel.empty(), + channel.fromList( params.excluded_accessions.tokenize(',') ), + params.excluded_accessions_file ? 
channel.fromPath(params.excluded_accessions_file, checkIfExists: true) : channel.empty(), + params.random_sampling_size, + params.random_sampling_seed, + params.outdir + ) + + ch_accessions = GET_PUBLIC_ACCESSIONS.out.accessions + + // ----------------------------------------------------------------- + // DOWNLOAD GEO DATASETS IF NEEDED + // ----------------------------------------------------------------- + + if ( !params.accessions_only) { + + DOWNLOAD_PUBLIC_DATASETS ( + species, + ch_accessions + ) + + ch_downloaded_datasets = DOWNLOAD_PUBLIC_DATASETS.out.datasets + + } + + if ( !params.accessions_only && !params.download_only ) { + + ch_counts = ch_input_datasets.mix( ch_downloaded_datasets ) + // returns an error with a message if no dataset was found + checkCounts( ch_counts, params.fetch_geo_accessions ) + + // ----------------------------------------------------------------- + // IDMAPPING + // ----------------------------------------------------------------- + + // tries to map gene IDs to Ensembl IDs whenever possible + ID_MAPPING( + ch_counts, + species, + params.skip_id_mapping, + params.skip_cleaning_gene_ids, + params.gprofiler_target_db, + params.gene_id_mapping, + params.gene_metadata, + params.min_occurrence_freq, + params.min_occurrence_quantile, + params.outdir + ) + + ch_counts = ID_MAPPING.out.counts + ch_whole_gene_id_mapping = ID_MAPPING.out.mapping + ch_whole_gene_metadata = ID_MAPPING.out.metadata + ch_valid_gene_ids = ID_MAPPING.out.valid_gene_ids + + // ----------------------------------------------------------------- + // FILTER OUT SAMPLES NOT VALID + // ----------------------------------------------------------------- + + SAMPLE_FILTERING ( + ch_counts, + ch_valid_gene_ids, + params.max_zero_ratio, + params.max_null_ratio, + params.outdir ) - ) - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList(), - [], - [] + ch_ratio_nulls_per_sample_file = 
SAMPLE_FILTERING.out.ratio_nulls_per_sample_file + + // ----------------------------------------------------------------- + // NORMALISATION OF RAW COUNT DATASETS (INCLUDING RNA-SEQ DATASETS) + // ----------------------------------------------------------------- + + EXPRESSION_NORMALISATION( + species, + SAMPLE_FILTERING.out.counts, + params.normalisation_method, + params.quantile_norm_target_distrib, + params.gff, + params.gene_length + ) + + ch_normalised_counts = EXPRESSION_NORMALISATION.out.counts + + // ----------------------------------------------------------------- + // ANALYSIS OF NORMALISED DATASETS + // ----------------------------------------------------------------- + + DATASET_ANALYSIS( + ch_normalised_counts + ) + + // ----------------------------------------------------------------- + // MERGE ALL DATASETS INTO ONE SINGLE DATASET + // ----------------------------------------------------------------- + + MERGE_DATA ( + ch_normalised_counts, + params.missing_value_imputer, + params.outdir + ) + + ch_all_imputed_counts = MERGE_DATA.out.all_imputed_counts + ch_all_counts = MERGE_DATA.out.all_counts + ch_whole_design = MERGE_DATA.out.whole_design + ch_platform_counts = MERGE_DATA.out.platform_counts + + // ----------------------------------------------------------------- + // COMPUTE BASE STATISTICS FOR ALL GENES + // ----------------------------------------------------------------- + + GENE_STATISTICS ( + ch_all_imputed_counts, + ch_all_counts, + ch_platform_counts, + ch_ratio_nulls_per_sample_file, + params.max_null_ratio_valid_sample + ) + + ch_all_datasets_stats = GENE_STATISTICS.out.stats + ch_platform_statistics = GENE_STATISTICS.out.platform_stats + + // ----------------------------------------------------------------- + // GET CANDIDATES AS REFERENCE GENE AND COMPUTES VARIOUS STABILITY VALUES + // ----------------------------------------------------------------- + + STABILITY_SCORING ( + ch_all_imputed_counts.map{ meta, file -> file }, + 
ch_whole_design, + ch_all_datasets_stats, + params.nb_candidates_per_section, + params.nb_sections, + params.skip_genorm, + params.stability_score_weights + ) + + ch_stats_all_genes_with_scores = STABILITY_SCORING.out.summary_statistics + + } + + // ----------------------------------------------------------------- + // REPORTING + // ----------------------------------------------------------------- + + REPORTING( + ch_all_imputed_counts, + ch_whole_design, + ch_stats_all_genes_with_scores, + ch_platform_statistics, + ch_whole_gene_metadata, + ch_whole_gene_id_mapping, + params.target_genes, + params.target_gene_file, + params.multiqc_config, + params.multiqc_logo, + params.multiqc_methods_description, + params.outdir ) - emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] + + emit: + multiqc_report = REPORTING.out.multiqc_report.toList() + all_genes_summary = REPORTING.out.all_genes_summary }