In [None]:
from intervaltree import Interval, IntervalTree
import re

In [22]:
def unquote(s):
    """Strip one pair of enclosing double quotes from ``s``.

    ``s`` is returned unchanged unless it is at least two characters long
    and both its first and last characters are ``"``. Only the outermost
    pair is removed.
    """
    too_short = len(s) < 2
    if too_short or s[0] != '"' or s[-1] != '"':
        return s
    return s[1:-1]

In [30]:
# Path of the probeset metadata table that read_intervaltrees() opens.
probeset_full = "probesets/all_meta.csv"
# Regex applied to "<chromosome><strand>" keys: "chr", up to two optional
# digits, an optional X/Y/M, then a strand sign.
# NOTE(review): every part after "chr" is optional, so keys such as "chr+" or
# "chr99X-" also match -- confirm whether that permissiveness is intended.
allowed_chr = "^chr[0-9]?[0-9]?[XYM]?[+-]$"

In [35]:
def read_intervaltrees():
    """Build one IntervalTree of probesets per chromosome/strand key.

    Reads the table at ``probeset_full``. Fields are split on whitespace
    (despite the ``.csv`` extension). The first non-empty line is the header
    and must name the columns ``probeset``, ``chromosome``, ``left_pos``,
    ``right_pos`` and ``strand``; blank lines are ignored.

    Rows are handled as follows:
      * rows whose ``chromosome + strand`` key does not match ``allowed_chr``
        are ignored;
      * rows with a non-integer coordinate, or a coordinate equal to 0, are
        skipped silently;
      * every other row is stored as ``tree[left_pos:right_pos] = probeset``.

    Returns:
        dict mapping keys such as ``"chr1+"`` to an ``IntervalTree`` whose
        intervals carry the probeset id as their data.

    Raises:
        ValueError: if a row has non-zero integer coordinates with
            ``left_pos >= right_pos``.
    """
    wanted_columns = ("probeset", "chromosome", "left_pos", "right_pos", "strand")
    column_of = None
    intervaltrees = {}
    with open(probeset_full) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse (and it must not be mistaken
                # for the header or indexed as a data row).
                continue
            if column_of is None:
                column_of = {name: index for index, name in enumerate(fields)}
                continue

            probeset, chromosome, left, right, strand = (
                fields[column_of[name]] for name in wanted_columns
            )
            key = chromosome + strand
            if not re.match(allowed_chr, key):
                continue

            # The tree is registered before the coordinates are validated, so
            # every allowed key seen in the file has an entry in the result
            # even when none of its rows end up being stored.
            tree = intervaltrees.setdefault(key, IntervalTree())

            try:
                left = int(left)
                right = int(right)
            except ValueError:
                # Non-numeric coordinates: skip this row.
                continue
            if left == 0 or right == 0:
                # A zero coordinate is treated as "no position" and skipped.
                continue
            if left >= right:
                raise ValueError(
                    "left_pos must be smaller than right_pos for probeset {} on {}: got {} and {}".format(
                        probeset, key, left, right
                    )
                )
            tree[left:right] = probeset
    return intervaltrees

In [36]:
# Build the per-chromosome/strand probeset trees once; later cells query this dict.
intervaltrees = read_intervaltrees()

In [117]:
# Scratch tree used in the next cells to try out the IntervalTree search call.
playX = IntervalTree()

In [118]:
# Add an interval from 10 to 15 carrying "hi" as its data.
playX[10:15] = "hi"

In [119]:
# Querying 9..16 with strict=True returns the stored 10..15 interval (see output).
# NOTE(review): presumably strict=True means "fully contained in the query
# range" -- confirm against the intervaltree documentation for the installed version.
playX.search(9, 16, strict=True)

{Interval(10, 15, 'hi')}

In [40]:
# Spot check: print the probeset ids stored under "chr1+" that a strict
# search of 861152..871276 returns.
for i in intervaltrees["chr1+"].search(861152, 871276, strict=True):
    print(i.data)

4053420
4053426
4053419
4053418
4053423
4053427
4053421
4053422
4053417
4053425
4053424


In [57]:
# Collects every (transcript_type, gene_type) pair seen by filter_probes();
# this cell must run before that function is called.
possibilities = set()

In [68]:
def _parse_gtf_attributes(attribute_field):
    """Parse a GTF attribute column (``key "value"; key "value";``) into a dict.

    Values have one pair of surrounding double quotes removed via ``unquote``.
    When a key occurs more than once, the last occurrence wins.
    """
    attributes = {}
    for entry in re.split(";[ ]?", attribute_field):
        entry = entry.strip()
        if not entry:
            # Empty piece, e.g. the one after the trailing ';'.
            continue
        # Split on the first run of whitespace only, so values that contain
        # spaces stay intact.
        parts = entry.split(None, 1)
        if len(parts) != 2:
            # A bare key without a value carries nothing to record.
            continue
        name, value = parts
        attributes[name] = unquote(value)
    return attributes


def _enveloped(tree, begin, end):
    """Return the intervals of ``tree`` that a strict search of begin..end yields."""
    # intervaltree 3.0 replaced search(begin, end, strict=True) with
    # envelop(begin, end); use whichever the installed version provides.
    if hasattr(tree, "envelop"):
        return tree.envelop(begin, end)
    return tree.search(begin, end, strict=True)


def filter_probes(intervaltrees):
    """Map probeset ids to the gene whose CDS/UTR feature encloses them.

    Reads ``annotations/gencode.v26lift37.annotation.gtf`` line by line.
    Lines starting with ``#`` and blank lines are skipped. A feature is used
    when all of the following hold:
      * its ``transcript_type`` or ``gene_type`` attribute is
        ``protein_coding``;
      * its feature type (third column) is ``CDS`` or ``UTR``;
      * its ``chromosome + strand`` key matches ``allowed_chr``.

    For each such feature, every interval of ``intervaltrees[key]`` returned
    by a strict search of the feature's start..end is recorded under the
    feature's ``gene_name``. Keys that have no tree are skipped.

    Side effect: every ``(transcript_type, gene_type)`` pair encountered is
    added to the module-level set ``possibilities``, which must already exist.

    Args:
        intervaltrees: dict mapping ``chromosome + strand`` keys to
            IntervalTree objects whose interval data are probeset ids.

    Returns:
        dict mapping probeset id -> gene name. If several features enclose
        the same probeset, the last one read wins.
    """
    filtered_probesets = {}
    with open("annotations/gencode.v26lift37.annotation.gtf") as geneList:
        for line in geneList:
            if not line.strip() or line[0] == "#":
                continue

            chromo, _source, typ, left, right, _, strand, _, meta = line.rstrip().split("\t")
            left = int(left)
            right = int(right)

            processed_meta = _parse_gtf_attributes(meta)
            transcript_type = processed_meta.get("transcript_type", "")
            gene_type = processed_meta.get("gene_type", "")

            possibilities.add((transcript_type, gene_type))

            protein_coding = "protein_coding" in (transcript_type, gene_type)
            is_coding_or_UTR = typ in ("CDS", "UTR")
            key = chromo + strand
            if not (protein_coding and is_coding_or_UTR and re.match(allowed_chr, key)):
                continue

            gene_name = processed_meta["gene_name"]
            tree = intervaltrees.get(key)
            if tree is None:
                # No probesets were loaded for this chromosome/strand.
                continue
            for hit in _enveloped(tree, left, right):
                filtered_probesets[hit.data] = gene_name
    return filtered_probesets

In [69]:
# Run the filter and display the resulting probeset id -> gene name mapping.
# NOTE(review): the result is not bound to a name and the whole dict is
# rendered as cell output -- consider assigning it and showing only a sample.
filter_probes(intervaltrees)

{'3158787': 'RECQL4',
 '2544704': 'DNMT3A',
 '3598496': 'RAB11A',
 '2921128': 'CDC40',
 '3752443': 'UTP6',
 '2811777': 'IPO11',
 '3320629': 'USP47',
 '3158022': 'PARP10',
 '2632310': 'EPHA3',
 '2672736': 'SCAP',
 '3035641': 'SNX8',
 '3406076': 'ATF7IP',
 '2408099': 'OXCT2',
 '2609071': 'GRM7',
 '2975478': 'AHI1',
 '3027273': 'TBXAS1',
 '2585987': 'ABCB11',
 '2515003': 'GAD1',
 '3931810': 'ERG',
 '3891346': 'TUBB1',
 '4000611': 'ACE2',
 '3738356': 'ASPSCR1',
 '3245117': 'ANTXRL',
 '2415952': 'DOCK7',
 '2590749': 'NCKAP1',
 '2336404': 'PRPF38A',
 '2351954': 'DDX20',
 '2831352': 'CXXC5',
 '3646520': 'SEC14L5',
 '2911367': 'ZNF451',
 '3663038': 'TEPP',
 '3660195': 'NOD2',
 '2612107': 'FGD5',
 '2716499': 'NSG1',
 '3888145': 'CSE1L',
 '3293268': 'SAR1A',
 '2495411': 'CNGA3',
 '2779677': 'PPP3CA',
 '3875974': 'PLCB4',
 '3666576': 'UTP4',
 '3593034': 'SLC24A5',
 '2947819': 'OR11A1',
 '3873355': 'FAM110A',
 '2535911': 'GPR35',
 '3974559': 'ATP6AP2',
 '3203939': 'KIF24',
 '3109218': 'SPAG1',
 '3