nf-core · drpatelh · Dec 11, 2019 · Dec 6, 2019 · Dec 6, 2019 · Dec 6, 2019
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -36,7 +36,7 @@
 ############################################
 
 ERROR_STR = 'ERROR: Please check samplesheet'
-HEADER = ['sample', 'fastq', 'barcode', 'genome']
+HEADER = ['sample', 'fastq', 'barcode', 'genome', 'transcriptome']
 
 ## CHECK HEADER
 fin = open(args.DESIGN_FILE_IN,'r')
@@ -50,49 +50,71 @@
     line = fin.readline()
     if line:
         lspl = [x.strip() for x in line.strip().split(',')]
-        sample,fastq,barcode,genome = lspl
+
+        ## CHECK LINE HAS THE SAME NUMBER OF FIELDS AS THE HEADER
+        ## (done before unpacking, otherwise a short/long/blank line dies with a ValueError traceback)
+        if len(lspl) != len(HEADER):
+            print("{}: Invalid number of columns (found {}, expected {})!\nLine: '{}'".format(ERROR_STR,len(lspl),len(HEADER),line.strip()))
+            sys.exit(1)
+        sample,fastq,barcode,genome,transcriptome = lspl
 
         ## CHECK VALID NUMBER OF COLUMNS PER SAMPLE
         numCols = len([x for x in lspl if x])
         if numCols < 2:
             print("{}: Invalid number of columns (minimum of 2)!\nLine: '{}'".format(ERROR_STR,line.strip()))
             sys.exit(1)
 
+        ## CHECK SAMPLE ID ENTRIES
         if sample:
-            ## CHECK SAMPLE ID HAS NO SPACES
             if sample.find(' ') != -1:
                 print("{}: Sample ID contains spaces!\nLine: '{}'".format(ERROR_STR,line.strip()))
                 sys.exit(1)
         else:
             print("{}: Sample ID not specified!\nLine: '{}'".format(ERROR_STR,line.strip()))
             sys.exit(1)
 
+        ## CHECK BARCODE ENTRIES
         if barcode:
-            ## CHECK BARCODE COLUMN IS INTEGER
             if not barcode.isdigit():
                 print("{}: Barcode not an integer!\nLine: '{}'".format(ERROR_STR,line.strip()))
                 sys.exit(1)
             else:
                 barcode = 'barcode%s' % (barcode.zfill(2))
 
+        ## CHECK FASTQ ENTRIES
         if fastq:
-            ## CHECK FASTQ FILE EXTENSION
             if fastq[-9:] != '.fastq.gz' and fastq[-6:] != '.fq.gz':
                 print("{}: FastQ file has incorrect extension (has to be '.fastq.gz' or '.fq.gz')!\nLine: '{}'".format(ERROR_STR,line.strip()))
+                sys.exit(1)
 
+        ## CHECK GENOME ENTRIES
         if genome:
-            ## CHECK GENOME HAS NO SPACES
             if genome.find(' ') != -1:
                 print("{}: Genome field contains spaces!\nLine: '{}'".format(ERROR_STR,line.strip()))
                 sys.exit(1)
 
-            ## CHECK GENOME EXTENSION
             if len(genome.split('.')) > 1:
                 if genome[-6:] != '.fasta' and genome[-3:] != '.fa' and genome[-9:] != '.fasta.gz' and genome[-6:] != '.fa.gz':
-                    print("{}: Genome field incorrect extension (has to be '.fasta' or '.fa' or '.fasta.gz' or '.fa.gz')!\nLine: '{}'".format(ERROR_STR,line.strip()))
+                    print("{}: Genome field incorrect extension (has to be '.fasta', '.fa', '.fasta.gz' or '.fa.gz')!\nLine: '{}'".format(ERROR_STR,line.strip()))
                     sys.exit(1)
 
-        outLines.append([sample,fastq,barcode,genome])
+        ## CHECK TRANSCRIPTOME ENTRIES
+        ## A '.gtf' ENTRY IS PASSED ON IN THE 'gtf' COLUMN AND REQUIRES A GENOME;
+        ## ANY OTHER (FASTA) ENTRY REPLACES 'genome' AND SETS 'is_transcripts' TO '1'
+        gtf = ''
+        is_transcripts = '0'
+        if transcriptome:
+
+            if transcriptome.find(' ') != -1:
+                print("{}: Transcriptome field contains spaces!\nLine: '{}'".format(ERROR_STR,line.strip()))
+                sys.exit(1)
+
+            if transcriptome[-6:] != '.fasta' and transcriptome[-3:] != '.fa' and transcriptome[-9:] != '.fasta.gz' and transcriptome[-6:] != '.fa.gz' and transcriptome[-4:] != '.gtf':
+                print("{}: Transcriptome field incorrect extension (has to be '.fasta', '.fa', '.fasta.gz', '.fa.gz' or '.gtf')!\nLine: '{}'".format(ERROR_STR,line.strip()))
+                sys.exit(1)
+
+            if transcriptome[-4:] == '.gtf':
+                gtf = transcriptome
+                if not genome:
+                    print("{}: If genome isnt provided, transcriptome must be in fasta format for mapping!\nLine: '{}'".format(ERROR_STR,line.strip()))
+                    sys.exit(1)
+            else:
+                is_transcripts = '1'
+                genome = transcriptome
+
+        outLines.append([sample,fastq,barcode,genome,gtf,is_transcripts])
     else:
         fin.close()
         break
@@ -106,7 +128,7 @@
 
 ## WRITE TO FILE
 fout = open(args.DESIGN_FILE_OUT,'w')
-fout.write(','.join(HEADER) + '\n')
+fout.write(','.join(['sample', 'fastq', 'barcode', 'genome', 'gtf', 'is_transcripts']) + '\n')
 for line in outLines:
     fout.write(','.join(line) + '\n')
 fout.close()
diff --git a/bin/gtf2bed b/bin/gtf2bed
@@ -0,0 +1,123 @@
+#!/usr/bin/env perl
+
+# Copyright (c) 2011 Erik Aronesty (erik@q32.com)
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
+
+use Getopt::Long;
+
+# -x: append the gene name (falling back to the gene id) as an extra column
+my $extended;
+GetOptions("x"=>\$extended);
+
+# input annotation: plain file, .gz, .zip, or "-" for STDIN
+$in = shift @ARGV;
+die "Can't open input: no file name given\n" unless defined $in && length $in;
+
+# Previously "|| die" was attached to the command string, so it only fired for
+# an empty (or "0") name and a failed open went unnoticed. Check every open
+# explicitly. The list form of the pipe open and the three-argument file open
+# also keep the file name from being interpreted by the shell or as a mode.
+if ($in =~ /\.gz$/) {
+	open(IN, '-|', 'gunzip', '-c', $in) || die "Can't open $in: $!\n";
+} elsif ($in =~ /\.zip$/) {
+	open(IN, '-|', 'unzip', '-p', $in) || die "Can't open $in: $!\n";
+} elsif ($in eq '-') {
+	# two-argument open kept here on purpose: "<-" means STDIN
+	open(IN, '<-') || die "Can't open $in: $!\n";
+} else {
+	open(IN, '<', $in) || die "Can't open $in: $!\n";
+}
+
+# Read the annotation line by line: group exon/miRNA records by transcript id
+# in %exons, keep one representative record per id in %trans, and remember
+# start/stop codon coordinates in %sc.
+while (<IN>) {
+	# a "##gff-version" pragma switches attribute parsing from GTF to GFF
+	$gff = 2 if /^##gff-version 2/;
+	$gff = 3 if /^##gff-version 3/;
+	next if /^#/ && $gff;
+
+	s/\s+$//;
+	# 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
+	my @f = split /\t/;
+	if ($gff) {
+		# most ver 2's stick gene names in the id field
+		($id) = $f[8]=~ /\bID="([^"]+)"/;
+		# most ver 3's stick unquoted names in the name field
+		($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3;
+	} else {
+		($id) = $f[8]=~ /transcript_id "([^"]+)"/;
+	}
+
+	# a failed match leaves $id undefined, so comment lines, blank lines and
+	# records without a usable id are dropped here
+	next unless $id && $f[0];
+
+	if ($f[2] eq 'exon') {
+		die "no position at exon on line $." if ! $f[3];
+		# gff3 puts :\d in exons sometimes
+		$id =~ s/:\d+$// if $gff == 3;
+		push @{$exons{$id}}, \@f;
+		# save lowest start: keep the record with the smallest begin, not just
+		# the first one seen, so the later sort by position is correct even
+		# when exons are listed in descending order
+		$trans{$id} = \@f if !$trans{$id} || $f[3] < $trans{$id}->[3];
+	} elsif ($f[2] eq 'start_codon') {
+		#optional, output codon start/stop as "thick" region in bed
+		$sc{$id}->[0] = $f[3];
+	} elsif ($f[2] eq 'stop_codon') {
+		$sc{$id}->[1] = $f[4];
+	} elsif ($f[2] eq 'miRNA' ) {
+		$trans{$id} = \@f if !$trans{$id} || $f[3] < $trans{$id}->[3];
+		push @{$exons{$id}}, \@f;
+	}
+}
+
+# Print one 12-column BED line per transcript (13 columns with -x),
+# ordered by chromosome and then by the position stored in %trans.
+for $id (
+	# sort by chr then pos
+	sort {
+		$trans{$a}->[0] cmp $trans{$b}->[0] ||
+		$trans{$a}->[3] <=> $trans{$b}->[3]
+	} (keys(%trans)) ) {
+		# only chromosome, strand and attributes are needed from the stored record
+		my ($chr, $dir, $attr) = @{$trans{$id}}[0, 6, 8];
+
+		# thick region from start/stop codons, when the annotation provided them
+		my ($cds, $cde);
+		($cds, $cde) = @{$sc{$id}} if $sc{$id};
+
+		# sort by pos
+		my @ex = sort { $a->[3] <=> $b->[3] } @{$exons{$id}};
+
+		my $beg = $ex[0][3];
+		my $end = $ex[-1][4];
+
+		if ($dir eq '-') {
+			# swap: on the reverse strand the stop codon lies at the lower
+			# coordinate. The +/-2 moves each value to the other edge of its
+			# codon record (assumes 3-base codon records).
+			($cds, $cde) = ($cde, $cds);
+			$cds -= 2 if $cds;
+			$cde += 2 if $cde;
+		}
+
+		# not specified, just use exons
+		$cds = $beg if !$cds;
+		$cde = $end if !$cde;
+
+		# adjust start for bed
+		--$beg; --$cds;
+
+		my $exn = @ex;												# exon count
+		my $exst = join ",", map {$_->[3]-$beg-1} @ex;				# exon start (relative to $beg)
+		my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex;			# exon size
+
+		# with -x, append gene_name (or gene_id when there is no gene_name)
+		my $extend = "";
+		if ($extended) {
+			my ($gene_id) = $attr =~ /gene_name "([^"]+)"/;
+			($gene_id) = $attr =~ /gene_id "([^"]+)"/ unless $gene_id;
+			$extend="\t$gene_id";
+		}
+		# added an extra comma to make it look exactly like ucsc's beds
+		print "$chr\t$beg\t$end\t$id\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,$extend\n";
+}
+
+
+close IN;
diff --git a/conf/base.config b/conf/base.config
@@ -71,6 +71,9 @@ process {
   withName:GetChromSizes {
     container = 'quay.io/biocontainers/samtools:1.9--h10a08f8_12'
   }
+  // Perl-only container; presumably GTFToBED runs the Perl script bin/gtf2bed - confirm against the process definition
+  withName:GTFToBED {
+    container = 'quay.io/biocontainers/perl:5.22.0.1--0'
+  }
   withName:MiniMap2Index {
     container = 'quay.io/biocontainers/minimap2:2.17--h8b12597_1'
   }