Skip to content

Commit

Permalink
Merge pull request #106 from nf-core/common-quantification
Browse files Browse the repository at this point in the history
Implement common quantification
  • Loading branch information
nictru committed Apr 25, 2024
2 parents c29124f + eceacd7 commit 8eb1b22
Show file tree
Hide file tree
Showing 93 changed files with 3,849 additions and 520 deletions.
8 changes: 4 additions & 4 deletions bin/DEA.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ get_args <- function(){
arg="circRNA",
short="c",
help="circRNA counts matrix",
default="circRNA_matrix.txt")
default="merged_counts.bed")

argp <- add_argument(
parser=argp,
Expand Down Expand Up @@ -57,7 +57,7 @@ giveError <- function(message){
quit()
}

usage <- function(){giveError("USAGE: DEA.R <gene_counts.csv> <phenotype.txt> <circRNA_matrix.txt> <species id> <ensembl_map>")}
usage <- function(){giveError("USAGE: DEA.R <gene_counts.csv> <phenotype.txt> <merged_counts.bed> <species id> <ensembl_map>")}


stage_data <- function(gene_counts, phenotype, circRNA, species, map){
Expand All @@ -69,9 +69,9 @@ stage_data <- function(gene_counts, phenotype, circRNA, species, map){
circ <- read.table(circRNA, sep ="\t", header = T, stringsAsFactors=FALSE)

# Merge circRNA genomic loci to ID
circ$circ <- with(circ, paste0(Chr, sep=":", Start, sep="-", Stop, sep=":", Strand))
circ$circ <- with(circ, paste0(chr, sep=":", start, sep="-", end, sep=":", strand))
rownames(circ) <- circ$circ
circ <- subset(circ, select=-c(Chr, Start, Stop, Strand, circ))
circ <- subset(circ, select=-c(chr, start, end, strand, circ))

# R converts '-' to '.' in colnames here and results in failures.
# If you need to make this 'smarter' check that colnames contains '.',
Expand Down
27 changes: 21 additions & 6 deletions bin/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
import pandas as pd
import numpy as np
import argparse
import csv

parser = argparse.ArgumentParser(description='Annotate circRNAs')
parser.add_argument('--input', type=str, help='Path to the input file')
parser.add_argument('--exon_boundary', type=int, help='Exon boundary')
parser.add_argument('--output', type=str, help='Path to the output file')
parser.add_argument('--output_bed', type=str, help='Path to the output bed file')
parser.add_argument('--output_gtf', type=str, help='Path to the output gtf')

args = parser.parse_args()

Expand All @@ -24,7 +26,7 @@
14: 'attributes'
}

attributes = ['gene_id', 'transcript_id']
attributes = ['gene_id', 'gene_name', 'transcript_id']

exon_boundary = args.exon_boundary

Expand All @@ -36,8 +38,9 @@
df_intergenic = df[mask]
df = df[~mask]
df_intergenic['type'] = 'intergenic-circRNA'
df_intergenic['gene_id'] = 'NaN'
df_intergenic['transcript_id'] = 'NaN'
df_intergenic['gene_id'] = 'intergenic_' + df_intergenic['name']
df_intergenic['gene_name'] = 'intergenic_' + df_intergenic['name']
df_intergenic['transcript_id'] = 'intergenic_' + df_intergenic['name']

# Convert attributes to a dictionary
df['attributes'] = df['attributes'].apply(lambda row: dict([[value.strip(r'"') for value in entry.strip().split(' ')] for entry in row.split(';') if entry]))
Expand All @@ -56,6 +59,7 @@
'name': lambda x: x.iloc[0],
'score': lambda x: x.iloc[0],
'gene_id': lambda x: list(x),
'gene_name': lambda x: list(x),
'transcript_id': lambda x: list(x),
'perfect': lambda x: list(x)
})
Expand All @@ -79,13 +83,14 @@ def determine_type(row):
df['no_transcript'] = df['transcript_id'].apply(lambda x: all([type(value) != str and np.isnan(value) for value in x]))
df['type'] = df.apply(lambda row: determine_type(row), axis=1)
df['gene_id'] = df.apply(lambda row: filter_perfect(row, 'gene_id'), axis=1)
df['gene_name'] = df.apply(lambda row: filter_perfect(row, 'gene_name'), axis=1)
df['transcript_id'] = df.apply(lambda row: filter_perfect(row, 'transcript_id'), axis=1)
# Drop perfect
df = df.drop(['perfect'], axis=1)

df = df.reset_index()
df_intergenic = df_intergenic.reset_index()
bed_order = ['chr', 'start', 'end', 'name', 'score', 'strand', 'type', 'gene_id', 'transcript_id']
bed_order = ['chr', 'start', 'end', 'name', 'score', 'strand', 'type', 'gene_id', 'gene_name', 'transcript_id']
df = df[bed_order]
df_intergenic = df_intergenic[bed_order]

Expand All @@ -94,4 +99,14 @@ def determine_type(row):
# Sort by chr, start, end
df = df.sort_values(['chr', 'start', 'end'])

df.to_csv(args.output, sep='\t', index=False, header=False)
df.to_csv(args.output_bed, sep='\t', index=False, header=False)

# Convert to GTF
df['source'] = 'circRNA'
df['frame'] = '.'
df['attributes'] = 'gene_id "' + df['gene_id'] + '"; gene_name "' + df['gene_name'] + '"; transcript_id "circ_' + df['name'] + '";'

gtf_order = ['chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'frame', 'attributes']
df = df[gtf_order]

df.to_csv(args.output_gtf, sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
50 changes: 0 additions & 50 deletions bin/circRNA_counts_matrix.py

This file was deleted.

16 changes: 16 additions & 0 deletions bin/combine_quantification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env python3

import argparse
import pandas as pd
import os

def build_parser():
    """Build the command-line parser for this script.

    Both options are required: without ``--inputs`` there is nothing to read,
    and without ``--output`` the combined table would never be written.
    """
    parser = argparse.ArgumentParser(description='Merge quantification files into a single file')
    parser.add_argument('--inputs', type=str, nargs='+', required=True, help='List input files to merge')
    parser.add_argument('--output', type=str, required=True, help='Path to output combined quantification file')
    return parser


def combine_quantification(inputs):
    """Read every quantification table and join them column-wise.

    Each file is parsed as tab-separated text with a header row; its first two
    columns become the (two-level) row index. The tables are then concatenated
    along the column axis, aligning rows on that index.

    Args:
        inputs: Paths of the tab-separated quantification files.

    Returns:
        A DataFrame holding the columns of all inputs side by side.
    """
    dfs = [pd.read_csv(f, sep='\t', index_col=[0, 1]) for f in inputs]
    # sort=True asks pandas to sort the row index when the inputs are not
    # already aligned, so the output order does not depend on input order.
    return pd.concat(dfs, axis=1, sort=True)


def main(argv=None):
    """Parse arguments, combine the inputs and write the result as TSV.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    args = build_parser().parse_args(argv)
    df_combined = combine_quantification(args.inputs)
    df_combined.to_csv(args.output, sep='\t')


if __name__ == '__main__':
    main()
134 changes: 0 additions & 134 deletions bin/consolidate_algorithms_intersection.R

This file was deleted.

31 changes: 31 additions & 0 deletions bin/counts_combined.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3

import argparse
import pandas as pd
import os

# Column layout of the headerless input bed files.
COLUMNS = ['chr', 'start', 'end', 'strand', 'count', 'tools']


def build_parser():
    """Build the command-line parser; every option is required."""
    parser = argparse.ArgumentParser(description='Combine per-sample circRNA counts into a single count matrix')
    parser.add_argument('--beds', type=str, nargs='+', required=True, help='List of bed files to merge')
    parser.add_argument('--out_bed', type=str, required=True, help='Output bed file')
    parser.add_argument('--out_tsv', type=str, required=True, help='Output tsv file')
    return parser


def sample_name(bed):
    """Derive the sample name: the file's basename up to its first dot."""
    return os.path.basename(bed).split('.')[0]


def load_sample_counts(beds):
    """Read each bed file into a one-column frame named after its sample.

    The frames are indexed by (chr, start, end, strand); the ``tools`` column
    is dropped and ``count`` is renamed to the sample name.

    Args:
        beds: Paths of the per-sample bed files.

    Returns:
        A list of DataFrames, one per input file, in input order.

    Raises:
        ValueError: If two inputs map to the same sample name. Previously the
            later file silently replaced the earlier one and its counts were
            lost.
    """
    seen = {}
    for bed in beds:
        name = sample_name(bed)
        if name in seen:
            raise ValueError(
                f"Sample name '{name}' is derived from both '{seen[name]}' and '{bed}'"
            )
        seen[name] = bed

    frames = []
    for name, bed in seen.items():
        counts = pd.read_csv(bed,
                             sep='\t',
                             header=None,
                             names=COLUMNS,
                             index_col=[0, 1, 2, 3]).drop('tools', axis=1)
        frames.append(counts.rename(columns={'count': name}))
    return frames


def main(argv=None):
    """Combine the per-sample counts and write them in bed and tsv form.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    args = build_parser().parse_args(argv)

    df = pd.concat(load_sample_counts(args.beds), axis=1)
    # A circRNA missing from a sample has no row there; treat that as count 0.
    df = df.fillna(0).astype(int)

    # Bed-style output keeps chr/start/end/strand as separate columns.
    df.to_csv(args.out_bed, sep='\t')

    # Tsv output collapses the position into a single 'chr:start-end:strand' ID.
    df.index = df.index.map(lambda x: f'{x[0]}:{x[1]}-{x[2]}:{x[3]}')
    df.index.name = 'ID'
    df.to_csv(args.out_tsv, sep='\t')


if __name__ == '__main__':
    main()
24 changes: 24 additions & 0 deletions bin/merge_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python3

import argparse
import pandas as pd

# Column layout of the headerless per-tool bed files.
BED_COLUMNS = ['chr', 'start', 'end', 'strand', 'count']
# Columns that together identify one circRNA.
POSITION_COLUMNS = ['chr', 'start', 'end', 'strand']


def build_parser():
    """Build the command-line parser; every option is required."""
    parser = argparse.ArgumentParser(description='Merge counts across tools for a single sample')
    parser.add_argument('--beds', type=str, nargs='+', required=True, help='List of bed files to merge')
    parser.add_argument('--tool_filter', type=int, required=True, help='Minimum number of tools to keep a circRNA')
    parser.add_argument('--duplicates_fun', type=str, required=True, help='Function to apply to duplicates', choices=['sum', 'mean', 'max', 'min'])
    parser.add_argument('--output', type=str, required=True, help='Output file')
    return parser


def merge_tools(beds, tool_filter, duplicates_fun):
    """Merge the per-tool bed files and keep circRNAs seen by enough tools.

    Rows sharing (chr, start, end, strand) are collapsed into one: their
    counts are combined with ``duplicates_fun`` and ``tool_count`` records how
    many distinct input files reported that position.

    Args:
        beds: Paths of the bed files, one per tool.
        tool_filter: Minimum ``tool_count`` a circRNA needs to be kept.
        duplicates_fun: Aggregation name applied to ``count``
            ('sum', 'mean', 'max' or 'min').

    Returns:
        A DataFrame with columns chr, start, end, strand, count, tool_count.
    """
    frames = []
    for tool_index, bed in enumerate(beds):
        frame = pd.read_csv(bed, sep='\t', header=None, names=BED_COLUMNS)
        # Tag every row with the file it came from, so that a position listed
        # twice in the same file is still counted as a single tool.
        frame['tool'] = tool_index
        frames.append(frame)

    merged = pd.concat(frames)
    merged = (merged.groupby(POSITION_COLUMNS)
              .agg({'count': duplicates_fun, 'tool': 'nunique'})
              .rename(columns={'tool': 'tool_count'})
              .reset_index())
    return merged[merged['tool_count'] >= tool_filter]


def main(argv=None):
    """Parse arguments, merge the beds and write a headerless TSV.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]`` when None.
    """
    args = build_parser().parse_args(argv)
    df = merge_tools(args.beds, args.tool_filter, args.duplicates_fun)
    df.to_csv(args.output, sep='\t', index=False, header=False)


if __name__ == '__main__':
    main()
2 changes: 1 addition & 1 deletion bin/prepare_circ_test.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Author: Barry Digby
## License: MIT

circ_mat = read.table("count_matrix.txt", header=T, sep="\t", check.names = FALSE, stringsAsFactors = F, row.names = "ID")
circ_mat = read.table("merged_counts.tsv", header=T, sep="\t", check.names = FALSE, stringsAsFactors = F, row.names = "ID")
gene_mat = read.table("gene_count_matrix.csv", sep=",", header=T, row.names="gene_id", stringsAsFactors = F)
map = read.table("circrna_host-gene.txt", header = F, sep="\t", stringsAsFactors = F)

Expand Down
16 changes: 0 additions & 16 deletions bin/reformat_count_matrix.R

This file was deleted.

Loading

0 comments on commit 8eb1b22

Please sign in to comment.