In [1]:
def unquote(s):
    """Return ``s`` without its enclosing double quotes, if it has a pair.

    Only a single leading and a single trailing ``"`` are removed, and only
    when both are present; any other value is returned unchanged.
    """
    # Anything shorter than two characters cannot hold a quote pair.
    if len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == '"' and last == '"':
        return s[1:-1]
    return s

In [2]:
# Input: comma-separated probeset annotation file, parsed by read_genome_metadata().
# NOTE(review): file name suggests the Affymetrix HuEx-1_0-st-v2 / hg19 annotation — confirm.
genome_metadata = "annotations/HuEx-1_0-st-v2.na36.hg19.probeset.csv"
# Input: tab-separated probe table (probeset id, x, y, sequence), parsed by
# read_chip_metadata() — note the ".csv" extension despite the tab delimiter.
chip_metadata = "probesets/probeset_coord_seq.csv"
# Output: tab-separated merged table written by write_all_metadata().
result = "probesets/all_meta.csv"

In [3]:
def read_genome_metadata(metadata_path):
    """Load per-probeset genomic coordinates from a CSV annotation file.

    Lines starting with ``#`` are treated as comments and ignored, blank
    lines are skipped, and the first remaining row is taken as the header.
    Fields are parsed with the ``csv`` module so that quoted values
    containing commas stay in one column.

    Parameters
    ----------
    metadata_path : str
        Path to the comma-separated annotation file. Its header must contain
        the columns ``probeset_id``, ``seqname``, ``strand``, ``start`` and
        ``stop``.

    Returns
    -------
    dict
        Maps each ``probeset_id`` value to the list
        ``[seqname, strand, start, stop]`` (all strings, as read). If an id
        occurs more than once, the last row wins.

    Raises
    ------
    KeyError
        If a required column is missing from the header.
    IndexError
        If a data row has fewer fields than a required column index.
    """
    # Function-scope import: the notebook has no shared import cell.
    import csv

    value_columns = ("seqname", "strand", "start", "stop")
    probeset_csv = {}
    # newline="" lets the csv module handle line endings itself.
    with open(metadata_path, newline="") as f:
        # Drop '#' comment lines before they reach the CSV parser.
        reader = csv.reader(line for line in f if not line.startswith("#"))
        id_index = None
        value_indices = None
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            if value_indices is None:
                # First non-comment, non-blank row is the header; resolve the
                # column positions once instead of on every data row.
                headers = {name: i for i, name in enumerate(row)}
                id_index = headers["probeset_id"]
                value_indices = [headers[name] for name in value_columns]
                continue
            probeset_csv[row[id_index]] = [row[i] for i in value_indices]
    return probeset_csv

In [4]:
# Parse the annotation file once: probeset id -> [chromosome, strand, start, stop].
probeset_csv = read_genome_metadata(genome_metadata)

In [5]:
def read_chip_metadata(chip_path):
    """Group the probes listed in a tab-separated file by probeset id.

    Every line must hold exactly four tab-separated fields in the order
    ``probeset_id, x, y, seq``; any other field count raises ``ValueError``
    from the unpacking.

    Returns a dict mapping each probeset id to a list of ``[seq, x, y]``
    entries (strings, in file order).
    """
    probes_by_probeset = {}
    with open(chip_path) as handle:
        for record in handle:
            fields = record.rstrip().split("\t")
            probeset_id, x, y, seq = fields
            if probeset_id not in probes_by_probeset:
                probes_by_probeset[probeset_id] = []
            probes_by_probeset[probeset_id].append([seq, x, y])
    return probes_by_probeset

In [6]:
# Group the chip's probes by probeset id; each probe is stored as [seq, x, y].
probeset_chip = read_chip_metadata(chip_metadata)

In [7]:
def write_all_metadata(path):
    """Write one tab-separated row per probe, joined with genomic coordinates.

    Reads the module-level dicts ``probeset_chip`` (probeset id -> list of
    ``[seq, x, y]``) and ``probeset_csv`` (probeset id -> ``[chromosome,
    strand, start, stop]``). The output starts with a header line, followed
    by one row per probe with the columns
    ``probeset, seq, x, y, strand, chromosome, left_pos, right_pos``.

    Probesets that have no entry in ``probeset_csv`` (non-genomic probes) are
    still written, with the four genomic columns left empty. Previously such
    rows silently reused the coordinates of the preceding probeset, or raised
    ``NameError`` when the very first probeset was unannotated.

    Parameters
    ----------
    path : str
        Destination file; it is created or overwritten.
    """
    # Empty chromosome/strand/left/right for probesets without annotation.
    no_annotation = ("", "", "", "")
    with open(path, "w") as f:
        print("probeset", "seq", "x", "y", "strand", "chromosome", "left_pos", "right_pos", file=f, sep="\t")
        for probeset, probes in probeset_chip.items():
            # Look the annotation up once per probeset, not once per probe.
            chromosome, strand, left, right = probeset_csv.get(probeset, no_annotation)
            for seq, x, y in probes:
                print(probeset, seq, x, y, strand, chromosome, left, right, file=f, sep="\t")



In [8]:
# Write the merged per-probe table to the output path configured in `result`.
write_all_metadata(result)