Commit

Merge pull request #117 from nf-core/exclude-overlong-transcripts
Fix some bugs and add upset plots
nictru committed May 10, 2024
2 parents 0f10c05 + 86af7b2 commit 5ddc061
Showing 13 changed files with 252 additions and 94 deletions.
2 changes: 1 addition & 1 deletion bin/DEA.R
@@ -283,7 +283,7 @@ DESeq2 <- function(inputdata, data_type){
rm(tmp)

dds <- DESeqDataSetFromMatrix(
countData=inputdata$circ,
countData=round(inputdata$circ),
colData=inputdata$pheno,
design = inputdata$design)

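A note on the round() fix above: DESeqDataSetFromMatrix() rejects count matrices containing non-integer values, and circRNA quantification can produce fractional estimates. A minimal pandas sketch of the same idea, with made-up sample names and values:

import pandas as pd

# Hypothetical fractional circRNA counts, e.g. from an EM-based quantifier
circ = pd.DataFrame(
    {"sample_1": [10.4, 0.0, 3.7], "sample_2": [8.9, 1.2, 0.0]},
    index=["chr1:100-500:+", "chr2:200-900:-", "chr3:50-400:+"],
)

# DESeq2 expects whole-number counts, so round before building the dataset
circ_int = circ.round().astype(int)
print(circ_int)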
14 changes: 6 additions & 8 deletions bin/counts_combined.py
@@ -11,20 +11,18 @@

args = parser.parse_args()

columns = ['chr', 'start', 'end', 'strand', 'count', 'tools']
columns = ['chr', 'start', 'end', 'name', 'count', 'strand']
dfs = {os.path.basename(bed).split('.')[0]: pd.read_csv(bed,
sep='\t',
header=None,
names=columns,
index_col=[0, 1, 2, 3])
.drop('tools', axis=1) for bed in args.beds}
index_col=["chr", "start", "end", "strand"],
usecols=["chr", "start", "end", "strand", "count"],
names=columns) for bed in args.beds}

dfs = [df.rename(columns={'count': sample}) for sample, df in dfs.items()]

df = pd.concat(dfs, axis=1)
df = df.fillna(0).astype(int)

df.to_csv(args.out_bed, sep='\t')
df = df.fillna(0)
df.to_csv(args.out_bed, sep='\t', header=True, index=True)

df.index = df.index.map(lambda x: f'{x[0]}:{x[1]}-{x[2]}:{x[3]}')
df.index.name = 'ID'
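The rewritten counts_combined.py boils down to: index every per-sample BED file by circRNA coordinates, rename its count column to the sample name, concatenate column-wise, and fill missing circRNAs with 0. A toy sketch of that pattern (sample names and values are illustrative):

import pandas as pd

# Two per-sample count tables indexed by circRNA coordinates (toy data)
idx = ["chr", "start", "end", "strand"]
s1 = pd.DataFrame({"count": [12, 3]},
                  index=pd.MultiIndex.from_tuples(
                      [("chr1", 100, 500, "+"), ("chr2", 200, 900, "-")], names=idx))
s2 = pd.DataFrame({"count": [7]},
                  index=pd.MultiIndex.from_tuples(
                      [("chr1", 100, 500, "+")], names=idx))

# Outer join on the coordinate index; circRNAs missing from a sample become 0
combined = pd.concat(
    [s1.rename(columns={"count": "sample_1"}),
     s2.rename(columns={"count": "sample_2"})],
    axis=1,
).fillna(0)
print(combined)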
5 changes: 5 additions & 0 deletions bin/merge_tools.py
@@ -21,4 +21,9 @@
'tool_count': 'sum'}).reset_index()
df = df[df['tool_count'] >= args.tool_filter]

df.drop('tool_count', axis=1, inplace=True)
df["name"] = df["chr"] + ":" + df["start"].astype(str) + "-" + df["end"].astype(str) + ":" + df["strand"]

df = df[['chr', 'start', 'end', 'name', 'count', 'strand']]

df.to_csv(args.output, sep='\t', index=False, header=False)
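With the new name column, each circRNA leaves merge_tools.py as a BED6 record whose name field encodes its coordinates (chr:start-end:strand). A runnable toy version of that construction (values are made up):

import pandas as pd

df = pd.DataFrame({"chr": ["chr1"], "start": [100], "end": [500],
                   "count": [12], "strand": ["+"]})

# Composite identifier used as the BED name field
df["name"] = df["chr"] + ":" + df["start"].astype(str) + "-" + df["end"].astype(str) + ":" + df["strand"]

# BED6 column order: chrom, start, end, name, score (here: count), strand
print(df[["chr", "start", "end", "name", "count", "strand"]]
      .to_csv(sep="\t", index=False, header=False))
# prints: chr1  100  500  chr1:100-500:+  12  +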
13 changes: 13 additions & 0 deletions conf/modules.config
@@ -562,6 +562,14 @@ if (!params.skip_trimming) {
]
}

withName: UPSET_SAMPLES {
ext.when = { params.tool.split(',').length > 1 }
}

withName: UPSET_ALL {
ext.when = { params.tool.split(',').length > 1 }
}

withName: INTERSECT_ANNOTATION {
ext.args = "-loj"
ext.suffix = "intersect.bed"
@@ -601,6 +609,11 @@ if (!params.skip_trimming) {
ext.suffix = "gtf"
}

withName: EXCLUDE_OVERLONG_TRANSCRIPTS {
ext.args = "-v FS='\\t' -v OFS='\\t' '\$5-\$4 <= 10000 { print }'"
ext.suffix = "filtered.gtf"
}

withName: MARK_CIRCULAR {
// GAWK process that marks FASTA headers.
// Leaves headers starting with "ENS" and non-header lines as is.
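The EXCLUDE_OVERLONG_TRANSCRIPTS arguments are a GAWK one-liner over a GTF file: columns 4 and 5 are the feature start and end, so '$5-$4 <= 10000 { print }' keeps only records spanning at most 10 kb. A rough Python equivalent of that filter, with placeholder file names:

# Rough equivalent of: awk -v FS='\t' -v OFS='\t' '$5-$4 <= 10000 { print }'
MAX_SPAN = 10_000

with open("annotation.gtf") as gtf, open("annotation.filtered.gtf", "w") as out:
    for line in gtf:
        if line.startswith("#"):              # pass GTF comment lines through
            out.write(line)
            continue
        fields = line.rstrip("\n").split("\t")
        start, end = int(fields[3]), int(fields[4])   # GTF columns 4 and 5
        if end - start <= MAX_SPAN:
            out.write(line)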
2 changes: 1 addition & 1 deletion modules/local/annotation/full_annotation/main.nf
@@ -1,5 +1,5 @@
process ANNOTATION {
tag "$meta.id"
tag "$meta.id:$meta.tool"
label 'process_single'

conda "bioconda::pandas=1.5.2"
@@ -6,4 +6,4 @@ channels:
- bioconda
- defaults
dependencies:
- "bioconda::bioconductor-summarizedexperiment=1.32.0"
- "bioconda::bioconductor-rtracklayer==1.62.0--r43ha9d7317_0"
8 changes: 5 additions & 3 deletions modules/local/quantification/merge_experiments/main.nf
@@ -4,12 +4,14 @@ process MERGE_EXPERIMENTS {

conda "${moduleDir}/environment.yaml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/bioconductor-summarizedexperiment:1.32.0--r43hdfd78af_0' :
'biocontainers/bioconductor-summarizedexperiment:1.32.0--r43hdfd78af_0' }"
'https://depot.galaxyproject.org/singularity/bioconductor-rtracklayer:1.62.0--r43ha9d7317_0' :
'biocontainers/bioconductor-rtracklayer:1.62.0--r43ha9d7317_0' }"

input:
tuple val(meta), path(experiments)
tuple val(meta), path(experiments)
tuple val(meta2), path(phenotype)
tuple val(meta3), path(gtf)
tuple val(meta4), path(tpm)

output:
tuple val(meta), path("${meta.id}.merged.rds"), emit: merged
@@ -6,6 +6,8 @@ paths <- c('${experiments.join("\', \'")}')
experiments <- lapply(paths, readRDS)

phenotype <- read.csv('${phenotype}', stringsAsFactors = FALSE)
annotation <- rtracklayer::import('${gtf}')
tpm <- read.table('${tpm}', header=TRUE, row.names=1)[, -1]

se_assays <- list()

@@ -37,6 +39,13 @@ for (col in colnames(colData(se))) {
}
}

# Add transcript annotation
annotation <- annotation[match(rownames(se), annotation\$transcript_id),]
rowData(se) <- annotation

# Add TPM
assay(se, "tpm", withDimnames = FALSE) <- tpm[rownames(se), colData(se)\$names]

saveRDS(se, '${meta.id}.merged.rds')

writeLines(
22 changes: 22 additions & 0 deletions modules/local/upset/main.nf
@@ -0,0 +1,22 @@
process UPSET {
tag "$meta.id"
label 'process_single'

conda "conda-forge::python=3.8.3 conda-forge::numpy=1.20.* conda-forge::pandas=1.2.* conda-forge::upsetplot=0.4.4"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/mulled-v2-f42a44964bca5225c7860882e231a7b5488b5485:47ef981087c59f79fdbcab4d9d7316e9ac2e688d-0' :
'biocontainers/mulled-v2-f42a44964bca5225c7860882e231a7b5488b5485:47ef981087c59f79fdbcab4d9d7316e9ac2e688d-0' }"
input:
tuple val(meta), val(tools), path(beds)

when:
task.ext.when == null || task.ext.when

output:
tuple val(meta), path("*.png"), emit: plot
path "*.upset_mqc.json" , emit: multiqc
path "versions.yml" , emit: versions

script:
template "upset.py"
}
79 changes: 79 additions & 0 deletions modules/local/upset/templates/upset.py
@@ -0,0 +1,79 @@
#!/usr/bin/env python3

import pandas as pd
import platform
import upsetplot
import matplotlib
import matplotlib.pyplot as plt
import distutils.version
import base64
import json

def format_yaml_like(data: dict, indent: int = 0) -> str:
"""Formats a dictionary to a YAML-like string.
Args:
data (dict): The dictionary to format.
indent (int): The current indentation level.
Returns:
str: A string formatted as YAML.
"""
yaml_str = ""
for key, value in data.items():
spaces = " " * indent
if isinstance(value, dict):
yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
else:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str

df_tools = pd.DataFrame(
{
"tool": "${tools.join(' ')}".split(" "),
"file": "${beds.join(' ')}".split(" ")
}
)

tool_files = df_tools.groupby("tool").agg(lambda x: x.tolist())["file"].to_dict()
tool_ids = {}

for tool, files in tool_files.items():
df_tool = pd.concat([pd.read_csv(f, sep="\\t", header=None) for f in files])
tool_ids[tool] = set(df_tool[3].unique())

dataset = upsetplot.from_contents(tool_ids)

upsetplot.plot(dataset, orientation='horizontal', show_counts=True)
plot_file = "${meta.id}.upset.png"
plt.savefig(plot_file)

image_string = base64.b64encode(open(plot_file, "rb").read()).decode("utf-8")
image_html = f'<div class="mqc-custom-content-image"><img src="data:image/png;base64,{image_string}" /></div>'

multiqc = {
'id': "${meta.id}_upset",
'parent_id': "upset_plots",
'parent_name': 'UpSet Plots',
'parent_description': 'UpSet plots showing the overlap between tools for each sample',
'section_name': 'UpSet: ${meta.id}',
'description': 'UpSet plot showing the overlap between tools for sample ${meta.id}',
'plot_type': 'image',
'data': image_html
}

with open("${meta.id}.upset_mqc.json", "w") as f:
f.write(json.dumps(multiqc, indent=4))

# Create version file
versions = {
"${task.process}" : {
"python": platform.python_version(),
"pandas": pd.__version__,
"upsetplot": upsetplot.__version__,
"matplotlib": matplotlib.__version__
}
}

with open("versions.yml", "w") as f:
f.write(format_yaml_like(versions))
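
For context on the template above: upsetplot.from_contents() takes a dict mapping each set name to its members and encodes every element's membership as a boolean MultiIndex (one level per tool), which upsetplot.plot() then renders as intersection counts. A toy example of the same call pattern, with invented tool names and circRNA IDs:

import upsetplot
import matplotlib.pyplot as plt

# Each detection tool maps to the set of circRNA IDs it reported (toy data)
tool_ids = {
    "circexplorer2": {"chr1:100-500:+", "chr2:200-900:-"},
    "ciri2": {"chr1:100-500:+", "chr3:50-400:+"},
    "find_circ": {"chr1:100-500:+"},
}

dataset = upsetplot.from_contents(tool_ids)
upsetplot.plot(dataset, orientation="horizontal", show_counts=True)
plt.savefig("toy.upset.png")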