In [1]:
import os
import os.path
import re

In [2]:
# Directory holding the input annotation files; joined with file names when calling load_cdf.
annots = "annotations"

In [3]:
# Output directory: created further down if missing, and the place dump_probesets writes to.
target = "probesets"

In [4]:
# Show which files are present in the annotation directory.
os.listdir(annots)

['HuEx-1_0-st-v2.text.cdf',
 'gencode.v26lift37.annotation.gtf',
 'HuEx-1_0-st-v2.na36.hg19.probeset.csv',
 'test.cdf']

In [5]:
# File name (inside `annots`) of the text CDF file; this is the file parsed with load_cdf later on.
chip_annot = "HuEx-1_0-st-v2.text.cdf"

In [6]:
# File name (inside `annots`) of the GTF annotation; not referenced by the cells visible here.
gtf_annot = "gencode.v26lift37.annotation.gtf"

In [7]:
# File name (inside `annots`) of the probeset annotation CSV; not referenced by the cells visible here.
probeset_annot = "HuEx-1_0-st-v2.na36.hg19.probeset.csv"

The CDF file uses .ini syntax, which is supported by the `configparser` standard library module in Python 3.

Unfortunately, the standard library module cannot deal with our CDF file efficiently: it uses over 16 GB of RAM to parse it. We hand-roll a parser instead, and test it on a small input.

In [8]:
import os
# Create the output directory if it does not exist yet. Unlike a swallowed
# FileExistsError, this still fails loudly when `target` exists but is a
# regular file rather than a directory.
os.makedirs(target, exist_ok=True)

In [9]:
def load_cdf(path):
    """Parse the ``[Unit<ID>_Block1]`` sections of a text (.ini-style) CDF file.

    The file is streamed line by line. Inside every ``[Unit<ID>_Block1]``
    section only the ``Cell<N>=...`` entries are kept; all other keys and all
    other sections are ignored. A section ends at the first blank line or at
    the next section header.

    Parameters
    ----------
    path : str
        Path to the CDF file.

    Returns
    -------
    dict
        ``{unit_id: {"Cell<N>": [field1, field2, field3]}}`` where the list
        holds the first three tab-separated fields of the entry, as strings.

    Raises
    ------
    ValueError
        If the same ``[Unit<ID>_Block1]`` header appears more than once.
    """
    header_prefix = "[Unit"
    header_suffix = "_Block1]"
    # Anchored on purpose: "CellHeader", "NumCells" or "Cell2 junk" must not
    # be mistaken for a cell entry, while "Cell10", "Cell123", ... must be.
    cell_key = re.compile(r"Cell[1-9][0-9]*")

    contextDict = {}
    with open(path) as f:
        context = None
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line terminates the current section.
                context = None
                continue

            if line.startswith(header_prefix) and line.endswith(header_suffix):
                # Strip the fixed prefix/suffix to obtain the unit identifier.
                context = line[len(header_prefix):-len(header_suffix)]
                if context in contextDict:
                    raise ValueError(
                        "duplicate section header in CDF file: {}".format(line)
                    )
                contextDict[context] = {}
            elif line.startswith("["):
                # Any other section header also ends the current section, so
                # its entries are never attributed to the previous unit.
                context = None
            elif context is not None:
                key, sep, raw_value = line.partition("=")
                # Lines without "=" are not key/value entries; skip them.
                if sep and cell_key.fullmatch(key):
                    contextDict[context][key] = raw_value.split("\t")[:3]
    return contextDict

test case

In [10]:
# Regression check: parsing the small test.cdf fixture must yield exactly these
# three units, each cell reduced to its first three tab-separated fields.
assert load_cdf(os.path.join(annots, "test.cdf")) == {'2315101': {'Cell1': ['986', '1674', 'ACATTGAATTATGGTGTTGGTCCGT'],
  'Cell2': ['1092', '677', 'GGTAACAAGTAGAAGACCGGGGACA'],
  'Cell3': ['796', '1862', 'GGGACAACAGACGTACATTGAATTA'],
  'Cell4': ['917', '193', 'ACAGAGAATCGGGTCTGAAGGGCAC']},
 '2315102': {'Cell1': ['341', '1677', 'TTCTGCTGCCGGCTGAACCTAGTGT'],
  'Cell2': ['144', '2250', 'GTCTCGACGTCTTCTGCTGCCGGCT'],
  'Cell3': ['689', '262', 'TCGACGTCTTCTGCTGCCGGCTGAA'],
  'Cell4': ['579', '1670', 'ACGTCTTCTGCTGCCGGCTGAACCT']},
 '2315103': {'Cell1': ['587', '2211', 'AGGTCGACCCAGCTGTCTGTCCCCG'],
  'Cell2': ['804', '2550', 'GGAGGTCGACCCAGCTGTCTGTCCC'],
  'Cell3': ['267', '2', 'ACGGAGGTCGACCCAGCTGTCTGTC'],
  'Cell4': ['1123', '1646', 'TCGACCCAGCTGTCTGTCCCCGACC']}}

In [11]:
# Parsed fixture kept in a variable, presumably for inspection; no later visible cell uses it.
test_case = load_cdf(os.path.join(annots, "test.cdf"))

In [12]:
def dump_probesets(probesets, path=None):
    """Write parsed probesets to a tab-separated text file, one cell per line.

    Each output line is the probeset identifier followed by the fields stored
    for one of its cells, separated by tabs. Cell keys themselves are not
    written.

    Parameters
    ----------
    probesets : dict
        Mapping ``{probeset_id: {cell_key: [field, ...]}}`` as returned by
        ``load_cdf``.
    path : str, optional
        Destination file. Defaults to ``probeset_coord_seq.csv`` inside the
        module-level ``target`` directory. Note that despite the ``.csv``
        extension the content is tab-separated.
    """
    if path is None:
        # Resolve the default lazily so the global is only needed when used.
        path = os.path.join(target, "probeset_coord_seq.csv")
    with open(path, "w") as f:
        for probeset, cells in probesets.items():
            for fields in cells.values():
                print(probeset, *fields, file=f, sep="\t")

In [13]:
# Parse the full chip CDF; reuse the file name configured in `chip_annot`
# instead of repeating the literal here.
probesets = load_cdf(os.path.join(annots, chip_annot))

In [14]:
# Write the parsed probesets to probeset_coord_seq.csv inside the `target` directory.
dump_probesets(probesets)