Commit

Merge pull request #117 from nf-core/exclude-overlong-transcripts
Fix some bugs and add upset plots
nictru committed May 10, 2024
2 parents 0f10c05 + 86af7b2 commit 5ddc061
Showing 13 changed files with 252 additions and 94 deletions.
2 changes: 1 addition & 1 deletion bin/DEA.R
@@ -283,7 +283,7 @@ DESeq2 <- function(inputdata, data_type){
rm(tmp)

dds <- DESeqDataSetFromMatrix(
countData=inputdata$circ,
countData=round(inputdata$circ),
colData=inputdata$pheno,
design = inputdata$design)

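A note on the round() fix above: DESeqDataSetFromMatrix() rejects count matrices containing non-integer values, and circRNA quantification can produce fractional estimates. A minimal pandas sketch of the same idea, with made-up sample names and values:

import pandas as pd

# Hypothetical fractional circRNA counts, e.g. from an EM-based quantifier
circ = pd.DataFrame(
    {"sample_1": [10.4, 0.0, 3.7], "sample_2": [8.9, 1.2, 0.0]},
    index=["chr1:100-500:+", "chr2:200-900:-", "chr3:50-400:+"],
)

# DESeq2 expects whole-number counts, so round before building the dataset
circ_int = circ.round().astype(int)
print(circ_int)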
14 changes: 6 additions & 8 deletions bin/counts_combined.py
@@ -11,20 +11,18 @@

args = parser.parse_args()

columns = ['chr', 'start', 'end', 'strand', 'count', 'tools']
columns = ['chr', 'start', 'end', 'name', 'count', 'strand']
dfs = {os.path.basename(bed).split('.')[0]: pd.read_csv(bed,
sep='\t',
header=None,
names=columns,
index_col=[0, 1, 2, 3])
.drop('tools', axis=1) for bed in args.beds}
index_col=["chr", "start", "end", "strand"],
usecols=["chr", "start", "end", "strand", "count"],
names=columns) for bed in args.beds}

dfs = [df.rename(columns={'count': sample}) for sample, df in dfs.items()]

df = pd.concat(dfs, axis=1)
df = df.fillna(0).astype(int)

df.to_csv(args.out_bed, sep='\t')
df = df.fillna(0)
df.to_csv(args.out_bed, sep='\t', header=True, index=True)

df.index = df.index.map(lambda x: f'{x[0]}:{x[1]}-{x[2]}:{x[3]}')
df.index.name = 'ID'
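The rewritten counts_combined.py boils down to: index every per-sample BED file by circRNA coordinates, rename its count column to the sample name, concatenate column-wise, and fill missing circRNAs with 0. A toy sketch of that pattern (sample names and values are illustrative):

import pandas as pd

# Two per-sample count tables indexed by circRNA coordinates (toy data)
idx = ["chr", "start", "end", "strand"]
s1 = pd.DataFrame({"count": [12, 3]},
                  index=pd.MultiIndex.from_tuples(
                      [("chr1", 100, 500, "+"), ("chr2", 200, 900, "-")], names=idx))
s2 = pd.DataFrame({"count": [7]},
                  index=pd.MultiIndex.from_tuples(
                      [("chr1", 100, 500, "+")], names=idx))

# Outer join on the coordinate index; circRNAs missing from a sample become 0
combined = pd.concat(
    [s1.rename(columns={"count": "sample_1"}),
     s2.rename(columns={"count": "sample_2"})],
    axis=1,
).fillna(0)
print(combined)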
5 changes: 5 additions & 0 deletions bin/merge_tools.py
@@ -21,4 +21,9 @@
'tool_count': 'sum'}).reset_index()
df = df[df['tool_count'] >= args.tool_filter]

df.drop('tool_count', axis=1, inplace=True)
df["name"] = df["chr"] + ":" + df["start"].astype(str) + "-" + df["end"].astype(str) + ":" + df["strand"]

df = df[['chr', 'start', 'end', 'name', 'count', 'strand']]

df.to_csv(args.output, sep='\t', index=False, header=False)
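With the new name column, each circRNA leaves merge_tools.py as a BED6 record whose name field encodes its coordinates (chr:start-end:strand). A runnable toy version of that construction (values are made up):

import pandas as pd

df = pd.DataFrame({"chr": ["chr1"], "start": [100], "end": [500],
                   "count": [12], "strand": ["+"]})

# Composite identifier used as the BED name field
df["name"] = df["chr"] + ":" + df["start"].astype(str) + "-" + df["end"].astype(str) + ":" + df["strand"]

# BED6 column order: chrom, start, end, name, score (here: count), strand
print(df[["chr", "start", "end", "name", "count", "strand"]]
      .to_csv(sep="\t", index=False, header=False))
# prints: chr1  100  500  chr1:100-500:+  12  +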
13 changes: 13 additions & 0 deletions conf/modules.config
@@ -562,6 +562,14 @@ if (!params.skip_trimming) {
]
}

withName: UPSET_SAMPLES {
ext.when = { params.tool.split(',').length > 1 }
}

withName: UPSET_ALL {
ext.when = { params.tool.split(',').length > 1 }
}

withName: INTERSECT_ANNOTATION {
ext.args = "-loj"
ext.suffix = "intersect.bed"
@@ -601,6 +609,11 @@ if (!params.skip_trimming) {
ext.suffix = "gtf"
}

withName: EXCLUDE_OVERLONG_TRANSCRIPTS {
ext.args = "-v FS='\\t' -v OFS='\\t' '\$5-\$4 <= 10000 { print }'"
ext.suffix = "filtered.gtf"
}

withName: MARK_CIRCULAR {
// GAWK process that marks FASTA headers.
// Leaves headers starting with "ENS" and non-header lines as is.
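The EXCLUDE_OVERLONG_TRANSCRIPTS arguments are a GAWK one-liner over a GTF file: columns 4 and 5 are the feature start and end, so '$5-$4 <= 10000 { print }' keeps only records spanning at most 10 kb. A rough Python equivalent of that filter, with placeholder file names:

# Rough equivalent of: awk -v FS='\t' -v OFS='\t' '$5-$4 <= 10000 { print }'
MAX_SPAN = 10_000

with open("annotation.gtf") as gtf, open("annotation.filtered.gtf", "w") as out:
    for line in gtf:
        if line.startswith("#"):              # pass GTF comment lines through
            out.write(line)
            continue
        fields = line.rstrip("\n").split("\t")
        start, end = int(fields[3]), int(fields[4])   # GTF columns 4 and 5
        if end - start <= MAX_SPAN:
            out.write(line)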
2 changes: 1 addition & 1 deletion modules/local/annotation/full_annotation/main.nf
@@ -1,5 +1,5 @@
process ANNOTATION {
tag "$meta.id"
tag "$meta.id:$meta.tool"
label 'process_single'

conda "bioconda::pandas=1.5.2"
@@ -6,4 +6,4 @@ channels:
- bioconda
- defaults
dependencies:
- "bioconda::bioconductor-summarizedexperiment=1.32.0"
- "bioconda::bioconductor-rtracklayer==1.62.0--r43ha9d7317_0"
8 changes: 5 additions & 3 deletions modules/local/quantification/merge_experiments/main.nf
@@ -4,12 +4,14 @@ process MERGE_EXPERIMENTS {

conda "${moduleDir}/environment.yaml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/bioconductor-summarizedexperiment:1.32.0--r43hdfd78af_0' :
'biocontainers/bioconductor-summarizedexperiment:1.32.0--r43hdfd78af_0' }"
'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.62.0--r43ha9d7317_0' :
'biocontainers/bioconductor-rtracklayer:1.62.0--r43ha9d7317_0' }"

input:
tuple val(meta), path(experiments)
tuple val(meta), path(experiments)
tuple val(meta2), path(phenotype)
tuple val(meta3), path(gtf)
tuple val(meta4), path(tpm)

output:
tuple val(meta), path("${meta.id}.merged.rds"), emit: merged
@@ -6,6 +6,8 @@ paths <- c('${experiments.join("\', \'")}')
experiments <- lapply(paths, readRDS)

phenotype <- read.csv('${phenotype}', stringsAsFactors = FALSE)
annotation <- rtracklayer::import('${gtf}')
tpm <- read.table('${tpm}', header=TRUE, row.names=1)[, -1]

se_assays <- list()

@@ -37,6 +39,13 @@ for (col in colnames(colData(se))) {
}
}

# Add transcript annotation
annotation <- annotation[match(rownames(se), annotation\$transcript_id),]
rowData(se) <- annotation

# Add TPM
assay(se, "tpm", withDimnames = FALSE) <- tpm[rownames(se), colData(se)\$names]

saveRDS(se, '${meta.id}.merged.rds')

writeLines(
22 changes: 22 additions & 0 deletions modules/local/upset/main.nf
@@ -0,0 +1,22 @@
process UPSET {
tag "$meta.id"
label 'process_single'

conda "conda-forge::python=3.8.3 conda-forge::numpy=1.20.* conda-forge::pandas=1.2.* conda-forge::upsetplot=0.4.4"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-f42a44964bca5225c7860882e231a7b5488b5485:47ef981087c59f79fdbcab4d9d7316e9ac2e688d-0' :
'biocontainers/mulled-v2-f42a44964bca5225c7860882e231a7b5488b5485:47ef981087c59f79fdbcab4d9d7316e9ac2e688d-0' }"
input:
tuple val(meta), val(tools), path(beds)

when:
task.ext.when == null || task.ext.when

output:
tuple val(meta), path("*.png"), emit: plot
path "*.upset_mqc.json" , emit: multiqc
path "versions.yml" , emit: versions

script:
template "upset.py"
}
79 changes: 79 additions & 0 deletions modules/local/upset/templates/upset.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3

import pandas as pd
import platform
import upsetplot
import matplotlib
import matplotlib.pyplot as plt
import distutils.version
import base64
import json

def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.
Args:
data (dict): The dictionary to format.
indent (int): The current indentation level.
Returns:
str: A string formatted as YAML.
"""
yaml_str = ""
for key, value in data.items():
spaces = " " * indent
if isinstance(value, dict):
yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
else:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str

df_tools = pd.DataFrame(
{
"tool": "${tools.join(' ')}".split(" "),
"file": "${beds.join(' ')}".split(" ")
}
)

tool_files = df_tools.groupby("tool").agg(lambda x: x.tolist())["file"].to_dict()
tool_ids = {}

for tool, files in tool_files.items():
df_tool = pd.concat([pd.read_csv(f, sep="\\t", header=None) for f in files])
tool_ids[tool] = set(df_tool[3].unique())

dataset = upsetplot.from_contents(tool_ids)

upsetplot.plot(dataset, orientation='horizontal', show_counts=True)
plot_file = "${meta.id}.upset.png"
plt.savefig(plot_file)

image_string = base64.b64encode(open(plot_file, "rb").read()).decode("utf-8")
image_html = f'<div class="mqc-custom-content-image"><img src="data:image/png;base64,{image_string}" /></div>'

multiqc = {
'id': "${meta.id}_upset",
'parent_id': "upset_plots",
'parent_name': 'UpSet Plots',
'parent_description': 'UpSet plots showing the overlap between tools for each sample',
'section_name': 'UpSet: ${meta.id}',
'description': 'UpSet plot showing the overlap between tools for sample ${meta.id}',
'plot_type': 'image',
'data': image_html
}

with open("${meta.id}.upset_mqc.json", "w") as f:
f.write(json.dumps(multiqc, indent=4))

# Create version file
versions = {
"${task.process}" : {
"python": platform.python_version(),
"pandas": pd.__version__,
"upsetplot": upsetplot.__version__,
"matplotlib": matplotlib.__version__
}
}

with open("versions.yml", "w") as f:
f.write(format_yaml_like(versions))
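
For context on the template above: upsetplot.from_contents() takes a dict mapping each set name to its members and encodes every element's membership as a boolean MultiIndex (one level per tool), which upsetplot.plot() then renders as intersection counts. A toy example of the same call pattern, with invented tool names and circRNA IDs:

import upsetplot
import matplotlib.pyplot as plt

# Each detection tool maps to the set of circRNA IDs it reported (toy data)
tool_ids = {
    "circexplorer2": {"chr1:100-500:+", "chr2:200-900:-"},
    "ciri2": {"chr1:100-500:+", "chr3:50-400:+"},
    "find_circ": {"chr1:100-500:+"},
}

dataset = upsetplot.from_contents(tool_ids)
upsetplot.plot(dataset, orientation="horizontal", show_counts=True)
plt.savefig("toy.upset.png")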