In [1]:
from intervaltree import Interval, IntervalTree
import re

In [2]:
def unquote(s):
    """Return ``s`` without one pair of enclosing double quotes.

    The quotes are removed only when ``s`` has at least two characters and
    both its first and last character are ``"``; otherwise ``s`` is returned
    unchanged.
    """
    wrapped_in_quotes = len(s) >= 2 and s[0] == s[-1] == '"'
    return s[1:-1] if wrapped_in_quotes else s

In [3]:
# Input: probe table; read by read_intervaltrees() and again by
# write_strict_annotation() (both split each line on whitespace).
probeset_full = "probesets/probesets_correct_coords.csv"

# Input: gene annotation in GTF format, read by filter_probes().
# Judging by the file name this is GENCODE v26 lifted to GRCh37 - TODO confirm.
genecode_file = "annotations/gencode.v26lift37.annotation.gtf"

# Output: tab-separated table written by write_strict_annotation().
result_file = "probesets/annotated_probesets.csv"
# Regex matched against "<chrom><strand>" keys in filter_probes().
# NOTE(review): every part after "chr" is optional, so besides chr1..chr22 and
# chrX/chrY/chrM this also accepts strings such as "chr+" or "chr99X-".
allowed_chr = "^chr[0-9]?[0-9]?[XYM]?[+-]$"

In [4]:
def read_intervaltrees(filename=None):
    """Build one interval tree of probe coordinates per chromosome and strand.

    Parameters
    ----------
    filename : str, optional
        Path of a whitespace-delimited probe table whose first non-empty line
        is a header containing at least the columns ``probeset_id``,
        ``chrom``, ``genocode_left``, ``genecode_right`` and ``strand``.
        Defaults to the module-level ``probeset_full``.

    Returns
    -------
    dict
        Maps ``chrom + strand`` (e.g. ``"chr6+"``) to an ``IntervalTree``.
        Each stored interval is ``tree[left:right]`` with the probeset id as
        its data.

    Raises
    ------
    ValueError
        If a row has integer, non-zero coordinates with ``left >= right``.
    """
    if filename is None:
        filename = probeset_full

    wanted = ["probeset_id", "chrom", "genocode_left", "genecode_right", "strand"]
    headers = {}
    intervaltrees = {}
    with open(filename) as f:
        for line in f:
            fields = line.rstrip().split()
            if not headers:
                # First non-empty line: remember the position of every column.
                headers = {name: position for position, name in enumerate(fields)}
                continue

            probeset, chromosome, left, right, strand = [fields[headers[name]] for name in wanted]
            key = chromosome + strand

            # A tree is registered for every chromosome/strand that occurs in
            # the table, even when this particular row is skipped below.
            tree = intervaltrees.get(key)
            if tree is None:
                tree = intervaltrees[key] = IntervalTree()

            try:
                left, right = int(left), int(right)
            except ValueError:
                # Non-integer coordinates: the row is skipped.
                continue
            if left == 0 or right == 0:
                # Zero coordinates are skipped (presumably placeholders for
                # probes without a genomic position - TODO confirm).
                continue
            if left >= right:
                raise ValueError(
                    "probeset {}: left coordinate {} must be smaller than right coordinate {}".format(
                        probeset, left, right
                    )
                )
            tree[left:right] = probeset
    return intervaltrees

In [5]:
# Build the per-chromosome/strand interval trees from the probe table.
intervaltrees = read_intervaltrees()

In [6]:
# Sanity check: print the probeset id of every probe that lies completely
# inside chr6:168072610-168072791 on the plus strand.
chr6_plus = intervaltrees["chr6+"]
# intervaltree >= 3.0 replaced search(begin, end, strict=True) with
# envelop(begin, end); support both so the cell runs on either version.
if hasattr(chr6_plus, "envelop"):
    contained = chr6_plus.envelop(168072610, 168072791)
else:
    contained = chr6_plus.search(168072610, 168072791, strict=True)
for interval in contained:
    print(interval.data)

2936520
2936520
2936520
2936520


In [7]:
# Collects every (transcript_type, gene_type, feature type) combination that
# filter_probes() encounters while reading the annotation file.
possibilities = set()

In [8]:
def filter_probes(intervaltrees, genecode_file):
    """Assign a gene name to every probeset contained in a coding feature.

    Reads a GTF annotation line by line. For each ``CDS`` or ``UTR`` feature
    whose ``transcript_type`` or ``gene_type`` attribute is
    ``protein_coding`` and whose ``chrom + strand`` key matches the
    module-level ``allowed_chr`` pattern, every probe interval completely
    contained in ``[left, right)`` is mapped to the feature's ``gene_name``.

    Parameters
    ----------
    intervaltrees : dict
        Maps ``chrom + strand`` to an ``IntervalTree`` whose intervals carry
        a probeset id as data (as built by ``read_intervaltrees``).
    genecode_file : str
        Path of the GTF annotation file.

    Returns
    -------
    dict
        Maps probeset id to gene name. When several features contain the same
        probeset, the one read last wins.

    Side effects
    ------------
    Adds ``(transcript_type, gene_type, feature_type)`` of every annotation
    line to the module-level ``possibilities`` set.
    """
    def enveloped(tree, begin, end):
        # intervaltree >= 3.0 replaced search(begin, end, strict=True) with
        # envelop(begin, end); support both versions.
        if hasattr(tree, "envelop"):
            return tree.envelop(begin, end)
        return tree.search(begin, end, strict=True)

    filtered_probesets = {}
    with open(genecode_file) as gene_list:
        for line in gene_list:
            # Skip header/comment lines and blank lines.
            if not line.strip() or line[0] == "#":
                continue

            chromo, _, typ, left, right, _, strand, _, meta = line.rstrip().split("\t")
            left = int(left)
            right = int(right)

            # Attributes look like: key "value"; key "value"; ...
            # NOTE(review): a ';' inside a quoted value would still be treated
            # as a separator here.
            processed_meta = {}
            for item in re.split(";[ ]?", meta):
                parts = item.strip().split(None, 1)
                if not parts:
                    # Empty piece, e.g. after the trailing ';'.
                    continue
                processed_meta[parts[0]] = unquote(parts[1]) if len(parts) > 1 else ""

            transcript_type = processed_meta.get("transcript_type", "")
            gene_type = processed_meta.get("gene_type", "")

            possibilities.add((transcript_type, gene_type, typ))

            protein_coding = transcript_type == "protein_coding" or gene_type == "protein_coding"
            is_coding_or_UTR = typ in ["CDS", "UTR"]
            key = chromo + strand
            if protein_coding and is_coding_or_UTR and re.match(allowed_chr, key):
                tree = intervaltrees.get(key)
                if tree is None:
                    # No probes on this chromosome/strand: nothing to annotate.
                    continue
                gene_name = processed_meta["gene_name"]
                for hit in enveloped(tree, left, right):
                    filtered_probesets[hit.data] = gene_name
    return filtered_probesets

In [9]:
# Map probeset id -> gene name for probes contained in protein-coding CDS/UTR features.
result = filter_probes(intervaltrees, genecode_file)

In [10]:
def write_strict_annotation(filename_in, probe_to_genename, filename_out):
    """Write the annotated rows of a probe table as a tab-separated file.

    Parameters
    ----------
    filename_in : str
        Whitespace-delimited probe table. Its first non-empty line is the
        header and must provide the columns ``probeset_id``, ``seq5to3plus``,
        ``chrom``, ``genocode_left``, ``genecode_right``, ``strand``, ``x``
        and ``y``.
    probe_to_genename : mapping
        Maps probeset id to gene name; rows whose probeset id is not a key
        are dropped.
    filename_out : str
        Destination file. It is overwritten and always starts with a header
        line; each kept row is prefixed with its gene name.
    """
    read_order = ("probeset_id", "seq5to3plus", "chrom", "genocode_left", "genecode_right", "strand", "x", "y")
    # Output swaps the position of "strand" relative to the input lookup order.
    write_order = ("probeset_id", "seq5to3plus", "chrom", "strand", "genocode_left", "genecode_right", "x", "y")

    column_of = {}
    with open(filename_out, "w") as f_out:
        print("gene_name", *write_order, sep="\t", file=f_out)
        with open(filename_in) as f_in:
            for raw in f_in:
                fields = raw.rstrip().split()
                if not column_of:
                    # Header line: remember where each column lives.
                    column_of = {name: position for position, name in enumerate(fields)}
                    continue

                # All required columns are looked up before the membership
                # test, so malformed rows fail even if they would be dropped.
                record = {name: fields[column_of[name]] for name in read_order}
                probeset_id = record["probeset_id"]
                if probeset_id not in probe_to_genename:
                    continue
                print(
                    probe_to_genename[probeset_id],
                    *(record[name] for name in write_order),
                    sep="\t",
                    file=f_out
                )

In [11]:
# Write the annotated subset of the probe table to result_file (tab-separated).
write_strict_annotation(probeset_full, result, result_file)

In [12]:
# Display every (transcript_type, gene_type, feature type) combination seen in the annotation.
possibilities

{('', '3prime_overlapping_ncRNA', 'gene'),
 ('', 'IG_C_gene', 'gene'),
 ('', 'IG_C_pseudogene', 'gene'),
 ('', 'IG_D_gene', 'gene'),
 ('', 'IG_J_gene', 'gene'),
 ('', 'IG_J_pseudogene', 'gene'),
 ('', 'IG_V_gene', 'gene'),
 ('', 'IG_V_pseudogene', 'gene'),
 ('', 'IG_pseudogene', 'gene'),
 ('', 'Mt_rRNA', 'gene'),
 ('', 'Mt_tRNA', 'gene'),
 ('', 'TEC', 'gene'),
 ('', 'TR_C_gene', 'gene'),
 ('', 'TR_D_gene', 'gene'),
 ('', 'TR_J_gene', 'gene'),
 ('', 'TR_J_pseudogene', 'gene'),
 ('', 'TR_V_gene', 'gene'),
 ('', 'TR_V_pseudogene', 'gene'),
 ('', 'antisense', 'gene'),
 ('', 'bidirectional_promoter_lncRNA', 'gene'),
 ('', 'lincRNA', 'gene'),
 ('', 'macro_lncRNA', 'gene'),
 ('', 'miRNA', 'gene'),
 ('', 'misc_RNA', 'gene'),
 ('', 'non_coding', 'gene'),
 ('', 'polymorphic_pseudogene', 'gene'),
 ('', 'processed_pseudogene', 'gene'),
 ('', 'processed_transcript', 'gene'),
 ('', 'protein_coding', 'gene'),
 ('', 'pseudogene', 'gene'),
 ('', 'rRNA', 'gene'),
 ('', 'scRNA', 'gene'),
 ('', 'sense_int