In [1]:
import os
import os.path
import re
import boto3

In [2]:
def download_files_from_s3(bucket, prefix):
    """Download every object under ``prefix`` in ``bucket`` that is not on disk yet.

    Each object is saved to a local path equal to its S3 key, relative to the
    current working directory. Keys for which a local file already exists are
    skipped, so the function can be re-run without re-downloading.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket to list and download from.
    prefix : str
        Key prefix selecting the objects to fetch.
    """
    s3 = boto3.client('s3')
    # Paginate: a single list call returns a bounded number of keys, and a page
    # for an empty listing carries no 'Contents' entry at all.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for entry in page.get('Contents', []):
            filename = entry["Key"]
            if filename.endswith("/"):
                # "Folder" placeholder keys have no file content to download.
                continue
            if os.path.isfile(filename):
                print("file {filename} exists locally".format(filename=filename))
                continue
            print("downloading {filename}".format(filename=filename))
            # download_file does not create missing local directories.
            parent = os.path.dirname(filename)
            if parent:
                os.makedirs(parent, exist_ok=True)
            # Use the caller's bucket; it used to be hard-coded to 'dm1-biomarkers'.
            s3.download_file(Bucket=bucket, Key=filename, Filename=filename)

In [3]:
# Directory names used throughout the notebook. `data` and `annots` are also
# passed to download_files_from_s3 as S3 key prefixes.
data = "CEL"
annots = "annotations"
target = "probesets"
bucket = "dm1-biomarkers"

# exist_ok=True makes this cell safe to re-run and avoids scanning the whole
# working directory just to test for one name.
for local_dir in (data, annots):
    os.makedirs(local_dir, exist_ok=True)

In [4]:
# Fetch every object under the "CEL" prefix; files already present locally are only reported.
download_files_from_s3(bucket, data)

file CEL/111747589_B.CEL exists locally
file CEL/111747589_M.CEL exists locally
file CEL/111747589_MR.cel exists locally
file CEL/117440822_B.CEL exists locally
file CEL/117440822_M.CEL exists locally
file CEL/117440822_MR.cel exists locally
file CEL/124563003_B.CEL exists locally
file CEL/124563003_M.CEL exists locally
file CEL/129523253_B.CEL exists locally
file CEL/129523253_M.CEL exists locally
file CEL/141772399_B.CEL exists locally
file CEL/159834720_B.CEL exists locally
file CEL/159834720_M.CEL exists locally
file CEL/159834720_MR.cel exists locally
file CEL/204472077_B.CEL exists locally
file CEL/204472077_M.CEL exists locally
file CEL/213653686_B.CEL exists locally
file CEL/229213757_B.CEL exists locally
file CEL/230974357_B.CEL exists locally
file CEL/230974357_M.CEL exists locally
file CEL/230974357_MR.cel exists locally
file CEL/270148799_B.CEL exists locally
file CEL/270148799_M.CEL exists locally
file CEL/315805040_B.CEL exists locally
file CEL/315805040_M.CEL exists loca

In [5]:
# Fetch every object under the "annotations" prefix; files already present locally are only reported.
download_files_from_s3(bucket, annots)

file annotations/GRCh37.primary_assembly.genome.fa exists locally
file annotations/HuEx-1_0-st-v2.na36.hg19.probeset.csv exists locally
file annotations/HuEx-1_0-st-v2.text.cdf exists locally
file annotations/gencode.v26lift37.annotation.gtf exists locally
file annotations/test.cdf exists locally


In [6]:
# File name of the text CDF file (found under the annotations/ directory).
chip_annot = "HuEx-1_0-st-v2.text.cdf"

In [7]:
# File name of the GTF annotation file (found under the annotations/ directory).
gtf_annot = "gencode.v26lift37.annotation.gtf"

In [8]:
# File name of the probeset annotation CSV (found under the annotations/ directory).
probeset_annot = "HuEx-1_0-st-v2.na36.hg19.probeset.csv"

The CDF file uses .ini syntax, which is supported by the `configparser` standard library module in Python 3.

Unfortunately, the standard library module cannot deal with our CDF file efficiently: it uses over 16 GB of RAM to parse it. We therefore hand-roll a parser and test it on a small input.

In [9]:
# Create the output directory. `os` is already imported in the first cell, and
# exist_ok=True makes re-running this cell a no-op instead of swallowing an exception.
os.makedirs(target, exist_ok=True)

In [10]:
def load_cdf(path):
    """Parse the ``[Unit<ID>_Block1]`` sections of a text CDF file.

    The file is read line by line, so memory use is bounded by the size of the
    result rather than by the whole file.

    Parameters
    ----------
    path : str
        Path of the CDF file to read.

    Returns
    -------
    dict
        Maps each ``<ID>`` taken from a ``[Unit<ID>_Block1]`` header to a dict
        of ``{"CellN": [field1, field2, field3]}``, where the fields are the
        first three tab-separated values of that ``CellN=...`` line.

    Raises
    ------
    ValueError
        If the same ``[Unit<ID>_Block1]`` header appears more than once.
    """
    # "Cell" followed by a number without leading zero; excludes keys such as "CellHeader".
    cell_key = re.compile(r"Cell[1-9][0-9]*")
    contextDict = {}
    context = None
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # A blank line ends the current section.
                context = None
                continue
            if line.startswith("["):
                if line.startswith("[Unit") and line.endswith("_Block1]"):
                    context = line[len("[Unit"):-len("_Block1]")]
                    if context in contextDict:
                        raise ValueError(
                            "duplicate section {!r} in {}".format(line, path)
                        )
                    contextDict[context] = {}
                else:
                    # Any other section header also ends the current section, so
                    # its CellN lines cannot overwrite the Block1 entries when no
                    # blank line separates the two sections.
                    context = None
                continue
            if context is None:
                continue
            key, sep, rest = line.partition("=")
            if sep and cell_key.fullmatch(key):
                contextDict[context][key] = rest.split("\t")[:3]
    return contextDict

Test case: the parser must reproduce the expected dictionary for the small `test.cdf` file.

In [11]:
# Check the parser against the small fixture: one entry per probeset ID, each
# mapping CellN to the first three tab-separated fields of that line (here two
# numeric fields and a nucleotide string).
assert load_cdf(os.path.join(annots, "test.cdf")) == {'2315101': {'Cell1': ['986', '1674', 'ACATTGAATTATGGTGTTGGTCCGT'],
  'Cell2': ['1092', '677', 'GGTAACAAGTAGAAGACCGGGGACA'],
  'Cell3': ['796', '1862', 'GGGACAACAGACGTACATTGAATTA'],
  'Cell4': ['917', '193', 'ACAGAGAATCGGGTCTGAAGGGCAC']},
 '2315102': {'Cell1': ['341', '1677', 'TTCTGCTGCCGGCTGAACCTAGTGT'],
  'Cell2': ['144', '2250', 'GTCTCGACGTCTTCTGCTGCCGGCT'],
  'Cell3': ['689', '262', 'TCGACGTCTTCTGCTGCCGGCTGAA'],
  'Cell4': ['579', '1670', 'ACGTCTTCTGCTGCCGGCTGAACCT']},
 '2315103': {'Cell1': ['587', '2211', 'AGGTCGACCCAGCTGTCTGTCCCCG'],
  'Cell2': ['804', '2550', 'GGAGGTCGACCCAGCTGTCTGTCCC'],
  'Cell3': ['267', '2', 'ACGGAGGTCGACCCAGCTGTCTGTC'],
  'Cell4': ['1123', '1646', 'TCGACCCAGCTGTCTGTCCCCGACC']}}

In [12]:
# Parsed test fixture kept in a variable; it is not referenced again in the
# cells shown here, so presumably it is for interactive inspection.
test_case = load_cdf(os.path.join(annots, "test.cdf"))

In [13]:
def dump_probesets(probesets):
    """Write one tab-separated row per cell to ``<target>/probeset_coord_seq.csv``.

    ``probesets`` maps a probeset key to a mapping of cell key -> iterable of
    values. Each output row is the probeset key followed by that cell's
    values; the cell key itself is not written. The output directory is taken
    from the module-level ``target`` and any existing file is overwritten.
    """
    out_path = os.path.join(target, "probeset_coord_seq.csv")
    with open(out_path, "w") as handle:
        for probeset_id, cell_map in probesets.items():
            for fields in cell_map.values():
                row = "\t".join(str(item) for item in (probeset_id, *fields))
                handle.write(row + "\n")

In [17]:
# Parse the full chip CDF. Reuse the `chip_annot` constant instead of repeating
# the file name literal, so the file name is defined in one place only.
probesets = load_cdf(os.path.join(annots, chip_annot))

In [18]:
# Write the parsed probesets to <target>/probeset_coord_seq.csv as tab-separated rows.
dump_probesets(probesets)

In [19]:
# Drop the reference to the parsed CDF dictionary so its memory can be reclaimed.
del probesets