Merge pull request #106 from nf-core/common-quantification

Implement common quantification
nf-core · Apr 25, 2024 · 8eb1b22 · 8eb1b22
2 parents c29124f + eceacd7
commit 8eb1b22
Show file tree

Hide file tree

Showing 93 changed files with 3,849 additions and 520 deletions.
diff --git a/bin/DEA.R b/bin/DEA.R
@@ -28,7 +28,7 @@ get_args <- function(){
             arg="circRNA",
             short="c",
             help="circRNA counts matrix",
-            default="circRNA_matrix.txt")
+            default="merged_counts.bed")
 
     argp <- add_argument(
             parser=argp,
@@ -57,7 +57,7 @@ giveError <- function(message){
     quit()
 }
 
-usage <- function(){giveError("USAGE: DEA.R <gene_counts.csv> <phenotype.txt> <circRNA_matrix.txt> <species id> <ensembl_map>")}
+usage <- function(){giveError("USAGE: DEA.R <gene_counts.csv> <phenotype.txt> <merged_counts.bed> <species id> <ensembl_map>")}
 
 
 stage_data <- function(gene_counts, phenotype, circRNA, species, map){
@@ -69,9 +69,9 @@ stage_data <- function(gene_counts, phenotype, circRNA, species, map){
     circ <- read.table(circRNA, sep ="\t", header = T, stringsAsFactors=FALSE)
 
     # Merge circRNA genomic loci to ID
-    circ$circ <- with(circ, paste0(Chr, sep=":", Start, sep="-", Stop, sep=":", Strand))
+    circ$circ <- with(circ, paste0(chr, sep=":", start, sep="-", end, sep=":", strand))
     rownames(circ) <- circ$circ
-    circ <- subset(circ, select=-c(Chr, Start, Stop, Strand, circ))
+    circ <- subset(circ, select=-c(chr, start, end, strand, circ))
 
     # R converts '-' to '.' in colnames here and results in failures.
     # If you need to make this 'smarter' check that colnames contains '.',

diff --git a/bin/annotation.py b/bin/annotation.py
@@ -3,11 +3,13 @@
 import pandas as pd
 import numpy as np
 import argparse
+import csv
 
 parser = argparse.ArgumentParser(description='Annotate circRNAs')
 parser.add_argument('--input', type=str, help='Path to the input file')
 parser.add_argument('--exon_boundary', type=int, help='Exon boundary')
-parser.add_argument('--output', type=str, help='Path to the output file')
+parser.add_argument('--output_bed', type=str, help='Path to the output bed file')
+parser.add_argument('--output_gtf', type=str, help='Path to the output gtf')
 
 args = parser.parse_args()
 
@@ -24,7 +26,7 @@
     14: 'attributes'
 }
 
-attributes = ['gene_id', 'transcript_id']
+attributes = ['gene_id', 'gene_name', 'transcript_id']
 
 exon_boundary = args.exon_boundary
 
@@ -36,8 +38,9 @@
 df_intergenic = df[mask]
 df = df[~mask]
 df_intergenic['type'] = 'intergenic-circRNA'
-df_intergenic['gene_id'] = 'NaN'
-df_intergenic['transcript_id'] = 'NaN'
+df_intergenic['gene_id'] = 'intergenic_' + df_intergenic['name']
+df_intergenic['gene_name'] = 'intergenic_' + df_intergenic['name']
+df_intergenic['transcript_id'] = 'intergenic_' + df_intergenic['name']
 
 # Convert attributes to a dictionary
 df['attributes'] = df['attributes'].apply(lambda row: dict([[value.strip(r'"') for value in entry.strip().split(' ')] for entry in row.split(';') if entry]))
@@ -56,6 +59,7 @@
     'name': lambda x: x.iloc[0],
     'score': lambda x: x.iloc[0],
     'gene_id': lambda x: list(x),
+    'gene_name': lambda x: list(x),
     'transcript_id': lambda x: list(x),
     'perfect': lambda x: list(x)
 })
@@ -79,13 +83,14 @@ def determine_type(row):
 df['no_transcript'] = df['transcript_id'].apply(lambda x: all([type(value) != str and np.isnan(value) for value in x]))
 df['type'] = df.apply(lambda row: determine_type(row), axis=1)
 df['gene_id'] = df.apply(lambda row: filter_perfect(row, 'gene_id'), axis=1)
+df['gene_name'] = df.apply(lambda row: filter_perfect(row, 'gene_name'), axis=1)
 df['transcript_id'] = df.apply(lambda row: filter_perfect(row, 'transcript_id'), axis=1)
 # Drop perfect
 df = df.drop(['perfect'], axis=1)
 
 df = df.reset_index()
 df_intergenic = df_intergenic.reset_index()
-bed_order = ['chr', 'start', 'end', 'name', 'score', 'strand', 'type', 'gene_id', 'transcript_id']
+bed_order = ['chr', 'start', 'end', 'name', 'score', 'strand', 'type', 'gene_id', 'gene_name', 'transcript_id']
 df = df[bed_order]
 df_intergenic = df_intergenic[bed_order]
 
@@ -94,4 +99,14 @@ def determine_type(row):
 # Sort by chr, start, end
 df = df.sort_values(['chr', 'start', 'end'])
 
-df.to_csv(args.output, sep='\t', index=False, header=False)
+df.to_csv(args.output_bed, sep='\t', index=False, header=False)
+
+# Convert to GTF
+df['source'] = 'circRNA'
+df['frame'] = '.'
+df['attributes'] = 'gene_id "' + df['gene_id'] + '"; gene_name "' + df['gene_name'] + '"; transcript_id "circ_' + df['name'] + '";'
+
+gtf_order = ['chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
+df = df[gtf_order]
+
+df.to_csv(args.output_gtf, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
diff --git a/bin/circRNA_counts_matrix.py b/bin/circRNA_counts_matrix.py
diff --git a/bin/combine_quantification.py b/bin/combine_quantification.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# Merge several tab-separated quantification tables side by side into a single TSV file.
+import argparse
+import pandas as pd
+import os
+
+parser = argparse.ArgumentParser(description='Merge quantification files into a single file')
+parser.add_argument('--inputs', type=str, nargs='+', help='List input files to merge')
+parser.add_argument('--output', type=str, help='Path to output combined quantification file')
+
+args = parser.parse_args()
+# Each input is read with its first two columns as a two-level row index.
+dfs = [pd.read_csv(f, sep='\t', index_col=[0, 1]) for f in args.inputs]
+df_combined = pd.concat(dfs, axis=1, sort=True)  # columns placed side by side, rows aligned on that index
+
+df_combined.to_csv(args.output, sep='\t')
diff --git a/bin/consolidate_algorithms_intersection.R b/bin/consolidate_algorithms_intersection.R
diff --git a/bin/counts_combined.py b/bin/counts_combined.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# Build one count matrix from several BED files (one count column per file) and write it in two layouts.
+import argparse
+import pandas as pd
+import os
+# NOTE(review): the description below says "across tools for a single sample", yet columns are keyed by a variable named 'sample' — confirm the wording.
+parser = argparse.ArgumentParser(description='Merge counts across tools for a single sample')
+parser.add_argument('--beds', type=str, nargs='+', help='List of bed files to merge')
+parser.add_argument('--out_bed', type=str, help='Output bed file')
+parser.add_argument('--out_tsv', type=str, help='Output tsv file')
+
+args = parser.parse_args()
+# Inputs are read as headerless, tab-separated files; the first four columns form the row index and 'tools' is dropped.
+columns = ['chr', 'start', 'end', 'strand', 'count', 'tools']
+dfs = {os.path.basename(bed).split('.')[0]: pd.read_csv(bed,
+                   sep='\t',
+                   header=None,
+                   names=columns,
+                   index_col=[0, 1, 2, 3])
+                   .drop('tools', axis=1) for bed in args.beds}
+# NOTE(review): dict keys are the basename up to the first '.'; two inputs sharing that prefix would collapse into one entry.
+dfs = [df.rename(columns={'count': sample}) for sample, df in dfs.items()]
+# Place the per-file count columns side by side; entries missing from a file become NaN and are then set to 0.
+df = pd.concat(dfs, axis=1)
+df = df.fillna(0).astype(int)
+# First output: the four index levels are written as leading columns.
+df.to_csv(args.out_bed, sep='\t')
+# Second output: the index is flattened to a single 'chr:start-end:strand' string, labelled ID.
+df.index = df.index.map(lambda x: f'{x[0]}:{x[1]}-{x[2]}:{x[3]}')
+df.index.name = 'ID'
+df.to_csv(args.out_tsv, sep='\t')
diff --git a/bin/merge_tools.py b/bin/merge_tools.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python3
+# Stack BED count files, aggregate rows that share chr/start/end/strand, and filter by how many rows were merged.
+import argparse
+import pandas as pd
+
+parser = argparse.ArgumentParser(description='Merge counts across tools for a single sample')
+parser.add_argument('--beds', type=str, nargs='+', help='List of bed files to merge')
+parser.add_argument('--tool_filter', type=int, help='Minimum number of tools to keep a circRNA')
+parser.add_argument('--duplicates_fun', type=str, help='Function to apply to duplicates', choices=['sum', 'mean', 'max', 'min'])
+parser.add_argument('--output', type=str, help='Output file')
+# NOTE(review): --tool_filter and --duplicates_fun declare no default (argparse then yields None); presumably callers always pass both — confirm.
+args = parser.parse_args()
+# Inputs are read with header=None, tab-separated, using the five column names below, then stacked row-wise.
+columns = ['chr', 'start', 'end', 'strand', 'count']
+dfs = [pd.read_csv(bed, sep='\t', header=None, names=columns) for bed in args.beds]
+df = pd.concat(dfs)
+# Each input row contributes 1, so the grouped sum below counts merged rows per locus.
+df['tool_count'] = 1
+# NOTE(review): a locus repeated inside one file would also raise tool_count — confirm inputs are unique per file.
+df = df.groupby(['chr', 'start', 'end', 'strand']).agg({'count': args.duplicates_fun,
+                                                        'tool_count': 'sum'}).reset_index()
+df = df[df['tool_count'] >= args.tool_filter]  # keep loci whose tool_count reaches the requested minimum
+
+df.to_csv(args.output, sep='\t', index=False, header=False)  # no header row, no index column
diff --git a/bin/prepare_circ_test.R b/bin/prepare_circ_test.R
@@ -3,7 +3,7 @@
 ## Author: Barry Digby
 ## License: MIT
 
-circ_mat = read.table("count_matrix.txt", header=T, sep="\t", check.names = FALSE, stringsAsFactors = F, row.names = "ID")
+circ_mat = read.table("merged_counts.tsv", header=T, sep="\t", check.names = FALSE, stringsAsFactors = F, row.names = "ID")
 gene_mat = read.table("gene_count_matrix.csv", sep=",", header=T, row.names="gene_id", stringsAsFactors = F)
 map = read.table("circrna_host-gene.txt", header = F, sep="\t", stringsAsFactors = F)
 

diff --git a/bin/reformat_count_matrix.R b/bin/reformat_count_matrix.R