Skip to content

Commit

Permalink
Merge branch 'dev' into ngscheckmate
Browse files Browse the repository at this point in the history
  • Loading branch information
SPPearce committed Nov 25, 2023
2 parents ddf7d73 + 700664f commit a4a946c
Show file tree
Hide file tree
Showing 9 changed files with 170 additions and 87 deletions.
28 changes: 24 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,33 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [dev]
## v3.14.0dev - [date]

### Enhancements and fixes
### Credits

### Enhancements & fixes

### Software dependencies

### Modules / Subworkflows

## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-21

### Credits

Special thanks to the following for their contributions to the release:

- [Jonathan Manning](https://github.com/pinin4fjords)
- [Regina Hertfelder Reynolds](https://github.com/RHReynolds)
- [Matthias Zepper](https://github.com/MatthiasZepper)

### Enhancements & fixes

- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Fixes error when transcript_fasta not provided and skip_gtf_filter set to true
- [#1125](https://github.com/nf-core/rnaseq/issues/1125) - Pipeline fails if transcript_fasta not provided and skip_gtf_filter = true
- [PR #993](https://github.com/nf-core/rnaseq/pull/993) Added NGSCheckMate for checking that samples come from the same individual
- [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables
- [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped
- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`.
- [PR #1127](https://github.com/nf-core/rnaseq/pull/1127) - Enlarge sampling to determine the number of columns in `filter_gtf.py` script.

### Parameters

Expand Down
4 changes: 2 additions & 2 deletions bin/filter_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
def tab_delimited(file: str) -> float:
"""Check if file is tab-delimited and return median number of tabs."""
with open(file, "r") as f:
data = f.read(1024)
data = f.read(102400)
return statistics.median(line.count("\t") for line in data.split("\n"))


def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
"""Filter GTF file based on FASTA sequence names."""
if tab_delimited(gtf_in) != 8:
raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.")
raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")

seq_names_in_genome = extract_fasta_seq_names(fasta)
logger.info(f"Extracted chromosome sequence names from {fasta}")
Expand Down
187 changes: 118 additions & 69 deletions bin/tximport.r
Original file line number Diff line number Diff line change
@@ -1,94 +1,143 @@
#!/usr/bin/env Rscript

# Written by Lorena Pantano and released under the MIT license.
# Script for importing and processing transcript-level quantifications.
# Written by Lorena Pantano, later modified by Jonathan Manning, and released
# under the MIT license.

# Loading required libraries
library(SummarizedExperiment)
library(tximport)

args = commandArgs(trailingOnly=TRUE)
# Parsing command line arguments
args <- commandArgs(trailingOnly=TRUE)
if (length(args) < 4) {
stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE)
stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>",
call.=FALSE)
}

coldata = args[1]
path = args[2]
sample_name = args[3]
quant_type = args[4]
tx2gene_path = args[5]

prefix = sample_name

info = file.info(tx2gene_path)
if (info$size == 0) {
tx2gene = NULL
} else {
rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE)
colnames(rowdata) = c("tx", "gene_id", "gene_name")
tx2gene = rowdata[,1:2]
# Assigning command line arguments to variables
coldata_path <- args[1]
path <- args[2]
prefix <- args[3]
quant_type <- args[4]
tx2gene_path <- args[5]

## Functions

# Build a table from a SummarizedExperiment object
build_table <- function(se.obj, slot) {
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
}

pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
fns = list.files(path, pattern = pattern, recursive = T, full.names = T)
names = basename(dirname(fns))
names(fns) = names

if (file.exists(coldata)) {
coldata = read.csv(coldata, sep="\t")
coldata = coldata[match(names, coldata[,1]),]
coldata = cbind(files = fns, coldata)
} else {
message("ColData not available: ", coldata)
coldata = data.frame(files = fns, names = names)
# Write a table to a file with given parameters
write_se_table <- function(params) {
file_name <- paste0(prefix, ".", params$suffix)
write.table(build_table(params$obj, params$slot), file_name,
sep="\t", quote=FALSE, row.names = FALSE)
}

dropInfReps = quant_type == "kallisto"
# Read transcript metadata from a given path
read_transcript_info <- function(tinfo_path){
info <- file.info(tinfo_path)
if (info$size == 0) {
stop("tx2gene file is empty")
}

transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE,
col.names = c("tx", "gene_id", "gene_name"))

txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
rownames(coldata) = coldata[["names"]]
extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]]))
if (length(extra) > 0) {
rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra))
extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra))
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
rownames(transcript_info) <- transcript_info[["tx"]]

list(transcript = transcript_info,
gene = unique(transcript_info[,2:3]),
tx2gene = transcript_info[,1:2])
}
rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),]
rownames(rowdata) = rowdata[["tx"]]
se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]),
colData = DataFrame(coldata),
rowData = rowdata)
if (!is.null(tx2gene)) {
gi = summarizeToGene(txi, tx2gene = tx2gene)
gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
growdata = unique(rowdata[,2:3])
growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
rownames(growdata) = growdata[["tx"]]
gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)
gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)
gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]),
colData = DataFrame(coldata),
rowData = growdata)

# Read and process sample/column data from a given path
read_coldata <- function(coldata_path){
if (file.exists(coldata_path)) {
coldata <- read.csv(coldata_path, sep="\t")
coldata <- coldata[match(names, coldata[,1]),]
coldata <- cbind(files = fns, coldata)
} else {
message("ColData not available: ", coldata_path)
coldata <- data.frame(files = fns, names = names)
}
rownames(coldata) <- coldata[["names"]]
}

build_table = function(se.obj, slot) {
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
# Create a SummarizedExperiment object with given data
create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) {
SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length),
colData = col_data,
rowData = row_data)
}

if(exists("gse")){
write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
# Main script starts here

# Define pattern for file names based on quantification type
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
fns <- list.files(path, pattern = pattern, recursive = T, full.names = T)
names <- basename(dirname(fns))
names(fns) <- names
dropInfReps <- quant_type == "kallisto"

# Import transcript-level quantifications
txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)

# Read transcript and sample data
transcript_info <- read_transcript_info(tx2gene_path)
coldata <- read_coldata(coldata_path)

# Create initial SummarizedExperiment object
se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]],
DataFrame(coldata), transcript_info$transcript)

# Setting parameters for writing tables
params <- list(
list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"),
list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"),
list(obj = se, slot = "length", suffix = "transcript_lengths.tsv")
)

# Process gene-level data if tx2gene mapping is available
if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) {
tx2gene <- transcript_info$tx2gene
gi <- summarizeToGene(txi, tx2gene = tx2gene)
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")

gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),]
rownames(gene_info) <- gene_info[["tx"]]

col_data_frame <- DataFrame(coldata)

# Create gene-level SummarizedExperiment objects
gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]],
col_data_frame, gene_info)
gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]],
col_data_frame, gene_info)
gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]],
col_data_frame, gene_info)

params <- c(params, list(
list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"),
list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"),
list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"),
list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"),
list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"),
list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"),
list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv")
))
}

write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
# Writing tables for each set of parameters
done <- lapply(params, write_se_table)

# Print sessioninfo to standard out
# Output session information and citations
citation("tximeta")
sessionInfo()

3 changes: 1 addition & 2 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -1110,7 +1110,6 @@ if (!params.skip_multiqc) {

if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') {
process {

withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' {
ext.args = { params.extra_salmon_quant_args ?: '' }
publishDir = [
Expand All @@ -1135,7 +1134,7 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') {
}
}

if (!params.skip_pseudo_alignment) {
if (!params.skip_pseudo_alignment && params.pseudo_aligner) {
process {
withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' {
publishDir = [
Expand Down
2 changes: 2 additions & 0 deletions modules/local/tximport/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ process TXIMPORT {
path "*gene_counts.tsv" , emit: counts_gene
path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled
path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled
path "*gene_lengths.tsv" , emit: lengths_gene
path "*transcript_tpm.tsv" , emit: tpm_transcript
path "*transcript_counts.tsv" , emit: counts_transcript
path "*transcript_lengths.tsv" , emit: lengths_transcript
path "versions.yml" , emit: versions

when:
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ manifest {
description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control."""
mainScript = 'main.nf'
nextflowVersion = '!>=23.04.0'
version = '3.13.1'
version = '3.14.0dev'
doi = 'https://doi.org/10.5281/zenodo.1400710'
}

Expand Down
14 changes: 8 additions & 6 deletions subworkflows/local/quantify_pseudo/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT {
results = ch_pseudo_results // channel: [ val(meta), results_dir ]
multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ]

tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ]
counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ]
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ]
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ]
tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ]
counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ]
tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv
counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv
lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv
tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv
counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv
lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv

merged_gene_rds = SE_GENE.out.rds // path: *.rds
merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds
Expand Down
12 changes: 12 additions & 0 deletions tower.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ reports:
display: "All samples STAR Salmon DESeq2 QC PDF plots"
"**/star_salmon/salmon.merged.gene_counts.tsv":
display: "All samples STAR Salmon merged gene raw counts"
"**/star_salmon/salmon.merged.gene_lengths.tsv":
display: "All samples STAR Salmon mean transcript length per gene"
"**/star_salmon/salmon.merged.transcript_lengths.tsv":
display: "All samples STAR Salmon transcript lengths"
"**/star_salmon/salmon.merged.gene_tpm.tsv":
display: "All samples STAR Salmon merged gene TPM counts"
"**/star_salmon/salmon.merged.transcript_counts.tsv":
Expand All @@ -15,6 +19,10 @@ reports:
display: "All samples Salmon DESeq2 QC PDF plots"
"**/salmon/salmon.merged.gene_counts.tsv":
display: "All samples Salmon merged gene raw counts"
"**/salmon/salmon.merged.gene_lengths.tsv":
display: "All samples Salmon mean transcript length per gene"
"**/salmon/salmon.merged.transcript_lengths.tsv":
display: "All samples Salmon transcript lengths"
"**/salmon/salmon.merged.gene_tpm.tsv":
display: "All samples Salmon merged gene TPM counts"
"**/salmon/salmon.merged.transcript_counts.tsv":
Expand All @@ -25,6 +33,10 @@ reports:
display: "All samples Kallisto DESeq2 QC PDF plots"
"**/kallisto/kallisto.merged.gene_counts.tsv":
display: "All samples Kallisto merged gene raw counts"
"**/kallisto/kallisto.merged.gene_lengths.tsv":
display: "All samples Kallisto mean transcript length per gene"
"**/kallisto/kallisto.merged.transcript_lengths.tsv":
display: "All samples Kallisto transcript lengths"
"**/kallisto/kallisto.merged.gene_tpm.tsv":
display: "All samples Kallisto merged gene TPM counts"
"**/kallisto/kallisto.merged.transcript_counts.tsv":
Expand Down
5 changes: 2 additions & 3 deletions workflows/rnaseq.nf
Original file line number Diff line number Diff line change
Expand Up @@ -829,9 +829,8 @@ workflow RNASEQ {
//
ch_pseudo_multiqc = Channel.empty()
ch_pseudoaligner_pca_multiqc = Channel.empty()
ch_pseudoaligner_clustering_multiqc = Channel.empty()

if (!params.skip_pseudo_alignment) {
ch_pseudoaligner_clustering_multiqc = Channel.empty()
if (!params.skip_pseudo_alignment && params.pseudo_aligner) {

if (params.pseudo_aligner == 'salmon') {
ch_pseudo_index = PREPARE_GENOME.out.salmon_index
Expand Down

0 comments on commit a4a946c

Please sign in to comment.