Merge pull request #869 from maxplanck-ie/filtergtf

Filtergtf
maxplanck-ie · Jan 23, 2023 · ac3dffd · ac3dffd
2 parents 460ef77 + 4e2b415
commit ac3dffd
Show file tree

Hide file tree

Showing 5 changed files with 11 additions and 12 deletions.
diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -7,6 +7,7 @@ snakePipes 2.7.2
 * STAR command has been updated. Now, STAR itself offers a command line option for processing input files.
 * Put a cap on python version for the deeptools env. The current version of deeptools is not supporting the newer python versions and some tools fail.
 * Update default condaDir.
+* The filter_gtf function has become more versatile. GTF files that include the delimiter (';') inside a quoted attribute value, e.g. in a description field, are now allowed. Gene names may now also contain symbols. Lastly, GTF files that use an RNA feature type (e.g. mRNA) instead of transcript in column 3 can also be parsed.
 
 snakePipes 2.7.1
 ----------------

diff --git a/docs/content/running_snakePipes.rst b/docs/content/running_snakePipes.rst
@@ -35,7 +35,7 @@ All individual jobs of the workflow will be submitted to the Grid engine using t
 
 **To run the workflow locally**, use the parameter ``--local`` for local mode and the parameter ``-j 10`` to specify the maximal number of used CPU threads (here: 10).
 
-**For single-end FASTQ files**, the workflow automatically recognized single suffix (eg. "sample1.fastq" instead of "sample1_R1.fastq") as single-end reads. However, mixing of single and paired-end files in the same folder is not supported currently.
+**For single-end FASTQ files**, note that single-end data still needs a valid suffix (e.g. sample1_R1.fastq.gz). With a proper suffix, single-end mode is detected by default. When executing some workflows with the ``--fromBAM`` flag, it is still necessary to set ``--singleEnd``.
 
 Once the DNA-mapping run has finished successfully, we can run the ChIP-seq analysis in the same directory.
 

diff --git a/snakePipes/shared/rscripts/merge_featureCounts.R b/snakePipes/shared/rscripts/merge_featureCounts.R
@@ -16,7 +16,7 @@ isallelic <- function(x) {
 get_df <- function(infile) {
   cat(infile, "\n")
   bname = gsub(".counts.txt" , "" , basename(infile) )
-  df = read.table(infile, header=T)
+  df = read.table(infile, header=T, sep='\t')
 
   if(isallelic(df) == TRUE) {
   print("Counts are allele-specific")

diff --git a/snakePipes/shared/rules/filterGTF.snakefile b/snakePipes/shared/rules/filterGTF.snakefile
@@ -38,6 +38,7 @@ rule gtf_to_files:
         "Annotation/genes.filtered.bed"
     run:
         import shlex
+        import re
 
         t2g = open(output[0], "w")
         symbol = open(output[1], "w")
@@ -47,9 +48,9 @@ rule gtf_to_files:
             if line.startswith("#"):
                 continue
             cols = line.strip().split("\t")
+            annos = re.split(''';(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', cols[8]) 
             if cols[2] == "gene":
                 # get the gene_name and gene_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 gene_name = None
                 for anno in annos:
@@ -62,9 +63,8 @@ rule gtf_to_files:
                         gene_name = anno[1]
                 if gene_id:
                     symbol.write("{}\t{}\n".format(gene_id, "" if not gene_name else gene_name))
-            elif cols[2] == "transcript":
+            elif cols[2] == "transcript" or 'RNA' in cols[2]:
                 # get the gene_id and transcript_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 transcript_id = None
                 gene_name = ""
@@ -84,15 +84,14 @@ rule gtf_to_files:
                     GTFdict[transcript_id] = [cols[0], cols[3], cols[4], cols[6], [], []]
             elif cols[2] == "exon":
                 # get the transcript_id
-                annos = cols[8].split(";")
                 transcript_id = None
                 for anno in annos:
                     anno = shlex.split(anno.strip(), " ")
                     if len(anno) == 0:
                         continue
                     if anno[0] == "transcript_id":
                         transcript_id = anno[1]
-                if transcript_id:
+                if transcript_id and transcript_id in GTFdict:
                     exonWidth = int(cols[4]) - int(cols[3]) + 1
                     exonOffset = int(cols[3]) - int(GTFdict[transcript_id][1])
                     GTFdict[transcript_id][4].append(str(exonWidth))

diff --git a/snakePipes/shared/rules/filterGTF_spikein.snakefile b/snakePipes/shared/rules/filterGTF_spikein.snakefile
@@ -38,6 +38,7 @@ rule spikein_gtf_to_files:
         "Annotation_spikein/genes.filtered.bed"
     run:
         import shlex
+        import re
 
         t2g = open(output[0], "w")
         symbol = open(output[1], "w")
@@ -47,9 +48,9 @@ rule spikein_gtf_to_files:
             if line.startswith("#"):
                 continue
             cols = line.strip().split("\t")
+            annos = re.split(''';(?=(?:[^'"]|'[^']*'|"[^"]*")*$)''', cols[8]) 
             if cols[2] == "gene":
                 # get the gene_name and gene_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 gene_name = None
                 for anno in annos:
@@ -62,9 +63,8 @@ rule spikein_gtf_to_files:
                         gene_name = anno[1]
                 if gene_id:
                     symbol.write("{}\t{}\n".format(gene_id, "" if not gene_name else gene_name))
-            elif cols[2] == "transcript":
+            elif cols[2] == "transcript" or 'RNA' in cols[2]:
                 # get the gene_id and transcript_id values
-                annos = cols[8].split(";")
                 gene_id = None
                 transcript_id = None
                 gene_name = ""
@@ -84,15 +84,14 @@ rule spikein_gtf_to_files:
                     GTFdict[transcript_id] = [cols[0], cols[3], cols[4], cols[6], [], []]
             elif cols[2] == "exon":
                 # get the transcript_id
-                annos = cols[8].split(";")
                 transcript_id = None
                 for anno in annos:
                     anno = shlex.split(anno.strip(), " ")
                     if len(anno) == 0:
                         continue
                     if anno[0] == "transcript_id":
                         transcript_id = anno[1]
-                if transcript_id:
+                if transcript_id and transcript_id in GTFdict:
                     exonWidth = int(cols[4]) - int(cols[3]) + 1
                     exonOffset = int(cols[3]) - int(GTFdict[transcript_id][1])
                     GTFdict[transcript_id][4].append(str(exonWidth))