# Control Centre

In [11]:
# make sure this doesn't clash with anything else!
redisDB = 1

# this will wipe out only the data generated previously by this script and owned by the script (i.e. only db=redisDB)
wipeDB = True

# you can set this up to 2**64 if you want everything read.
readNo = 2**64

# path to annotation file
annotation = "CEL_files/HuEx-1_0-st-v2.na36.hg19.probeset.csv"

# which line in the file contains the schema
schemaLine = 22

# This is safe to run: it's just some definitions

In [12]:
import redis
import json

r = redis.StrictRedis(host='localhost', port=2050, db=redisDB)

metadataKeys = ["probeset_id", "seqname", "strand", "start", "stop", "probe_count", "transcript_cluster_id", "exon_id", "psr_id", "level"]

def splitOnCommas(line):
    return [e[1:-1] for e in line.rstrip().split(",")]

def processLine(schema, line):
    exampleValues = splitOnCommas(line)
    mapping = {s : e for s, e in zip(schema, exampleValues)}
    usualIDs = set()
    weirdIDs = set()
    try:
        for element in mapping['gene_assignment'].split("///"):
            left, right = element.split("//")
            usualIDs.add(right.strip())
    except Exception:
        usualIDs = set()
    try:
        for element in mapping['mrna_assignment'].split("///"):
            weirdID = element.split("//")[0].strip()
            weirdIDs.add(weirdID)
    except Exception:
        weirdIDs = set()
    probesetID = mapping["probeset_id"]
    transID = mapping["transcript_cluster_id"]
    metadata = {key:value for key, value in mapping.items() if key in metadataKeys}
    return probesetID, transID, metadata, usualIDs, weirdIDs

r.set('main$metadataKeys', json.dumps(metadataKeys))

def upsertWeirdID(weirdID, transID):
    r.sadd("search$weird$" + weirdID, transID)
    r.sadd("search$trans$weird$" + transID, weirdID)
    
def upsertUsualID(usualID, transID):
    r.sadd("search$usual$" + usualID, transID)
    r.sadd("search$trans$usual$" + transID, usualID)
    
def upsertProbesetID(probesetID, transID):
    r.sadd("search$probeset$" + probesetID, transID)

toInt = set(["probeset_id", "start", "stop", "probe_count", "transcript_cluster_id", "exon_id", "psr_id"])

def setMetadata(probesetID, metadata):
    toDump = []
    for key in metadataKeys:
        if key in toInt:
            try:
                toDump.append(int(metadata[key]))
            except ValueError:
                toDump.append(-1)
        else:
            toDump.append(metadata[key])
    args= ["probeset$metadata", probesetID, json.dumps(toDump)]
    r.hset(*args)
    
def upsertTrans(transID, probesetID):
    r.sadd("trans$probeset$" + transID, probesetID)

def writeToRedis(probesetID, transID, metadata, usualIDs, weirdIDs):
    for usualID in usualIDs:
        upsertUsualID(usualID, transID)
    for weirdID in weirdIDs:
        upsertWeirdID(weirdID, transID)
    upsertProbesetID(probesetID, transID)
    setMetadata(probesetID, metadata)
    upsertTrans(transID, probesetID)

# Careful -- this can potentially wipe the DB (but will rebuild it afterwards)

In [3]:
if wipeDB:
    r.flushdb()
with open(annotation) as f:
    schema = []
    exampleValues = []
    j = schemaLine
    k = readNo
    for i, line in enumerate(f):
        if i == j:
            schema = splitOnCommas(line)
        elif i > j:
            resultTuple = processLine(schema, line)
            writeToRedis(*resultTuple)
        if i == j + readNo + 1:
            break

# Example usage

In [4]:
r.smembers(b'trans$probeset$' + list(r.smembers(b'search$probeset$2315105'))[0])

{b'2315101', b'2315102', b'2315103', b'2315104', b'2315105'}

In [8]:
r.smembers(b'trans$probeset$2315100')

{b'2315101', b'2315102', b'2315103', b'2315104', b'2315105'}

In [3]:
r.smembers(b'trans$probeset$' + list(r.smembers('search$usual$DDX11L1'))[0])

NameError: name 'r' is not defined

In [7]:
r.smembers(b'trans$probeset$' + b"2648232")

{b'2648233', b'2648234'}

In [8]:

r.hget(b'probeset$metadata', b'2315105')

b'[2315105, "chr1", "+", 14150, 14368, 4, 2315100, 4, 5, "extended"]'

In [19]:
probeset = b'2315105'
metadataKeys = json.loads(r.get(b'main$metadataKeys').decode("ascii", errors="ignore"))
metadataToIndex = {key:i for i, key in enumerate(metadataKeys)}

def checkProbesetLevel(probeset):
    metadata = json.loads(r.hget(b'probeset$metadata', probeset).decode("ascii", errors="ignore"))
    result = metadata[metadataToIndex["level"]]
    return result

checkProbesetLevel(probeset)

'extended'

In [14]:
r.hget(b'probeset$metadata', b'2614449')

b'[2614449, "chr3", "+", 25305161, 25305417, 4, 2614448, 186212, 243542, "full"]'

In [None]:
dontprint = r.smembers(b'trans$probeset$' + list(r.smembers('search$probeset$3695552'))[0])

In [15]:
r.smembers('search$probeset$2819466')

{b'2819436'}

In [None]:
r.smembers(b"search$trans$usual$" + b"2315100")

In [16]:
r.smembers(b"search$trans$weird$" + b"2315100")

{b'---',
 b'AK093685',
 b'AK125998',
 b'BC070227',
 b'ENST00000437401',
 b'ENST00000450305',
 b'ENST00000456328',
 b'ENST00000507418',
 b'ENST00000513886',
 b'ENST00000559159',
 b'ENST00000562189',
 b'ENST00000624431',
 b'GENSCAN00000010471',
 b'GENSCAN00000017672',
 b'NONHSAT000001',
 b'NONHSAT000002',
 b'NONHSAT000003',
 b'NONHSAT000004',
 b'NONHSAT051714',
 b'NONHSAT051715',
 b'NONHSAT051716',
 b'NONHSAT051717',
 b'NONHSAT051719',
 b'NONHSAT051720',
 b'NONHSAT051721',
 b'NONHSAT073809',
 b'NONHSAT073810',
 b'NONHSAT073811',
 b'NONHSAT073812',
 b'NONHSAT073813',
 b'NONHSAT129876',
 b'NONHSAT129877',
 b'NONHSAT139258',
 b'NONHSAT139259',
 b'NR_024004',
 b'NR_024005',
 b'NR_034090',
 b'NR_045117',
 b'NR_046018',
 b'NR_051985',
 b'NR_051986',
 b'NR_110561',
 b'OTTHUMT00000002844',
 b'OTTHUMT00000058841',
 b'OTTHUMT00000109036',
 b'OTTHUMT00000362751',
 b'OTTHUMT00000417614',
 b'OTTHUMT00000417615',
 b'OTTHUMT00000420565',
 b'TCONS_l2_00010384-XLOC_l2_005087',
 b'TCONS_l2_00010385-XLOC_l

In [None]:
r.save()

In [17]:
r.smembers('search$weird$OTTHUMT00000058841')

{b'2315100', b'3642555', b'3642560', b'3642566'}