In [25]:
import os
import os.path
import re
import numpy

In [26]:
# Load the gene list (one gene name per line). Any name seen more than once
# is printed so duplicates in the annotation file are visible.
nakamori_genes = set()
with open("annotations/nakamori_genes.txt") as f:
    for line in f:
        gene = line.strip()
        if not gene:
            # A blank (or whitespace-only) line would otherwise be stored as
            # an empty gene name and reported as "unmapped" further down.
            continue
        if gene in nakamori_genes:
            print(gene)
        nakamori_genes.add(gene)

LDB3
DMD


In [33]:
# Replace the gene set with a single hand-picked gene.
nakamori_genes = {"TNNI1"}

In [34]:
# Every entry name inside the "experiment_blood" directory is collected here;
# produce_data() later opens os.path.join(experiment, gene) for each gene, so
# these names double as per-gene file names.
genecode_genes = set(os.listdir("experiment_blood"))

In [35]:
# Number of genes currently in the gene set of interest.
len(nakamori_genes)

1

In [36]:
# Number of entries found in the "experiment_blood" directory.
len(genecode_genes)

19826

# Genes we can't remap

In [38]:
# Genes of interest that have no matching entry in genecode_genes.
nakamori_genes - genecode_genes

set()

# Load metadata

In [39]:
# copied from quantile_normalise
def load_metadata(path="metadata.txt"):
    """Read the whitespace-delimited patient metadata table.

    The first non-blank line is the header: its first field (the label of the
    ID column) is ignored and the remaining fields name the metadata columns.
    Every later non-blank line describes one patient: the patient ID first,
    followed by one value per header column. Blank lines are skipped.

    Parameters
    ----------
    path : str, optional
        Location of the metadata file. Defaults to "metadata.txt", the path
        that was previously hard-coded.

    Returns
    -------
    metadata_order : list of str
        Patient IDs in the order they appear in the file.
    metadata : dict
        Maps each patient ID to a ``{column_name: value}`` dict. Values are
        kept as strings. Columns and values are paired positionally with
        ``zip``, so surplus fields on either side are silently dropped.
    """
    metadata = {}
    metadata_order = []
    names = None
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line has no patient ID; indexing it would raise
                # IndexError.
                continue
            if names is None:
                names = fields[1:]
                continue
            patient_id = fields[0]
            metadata_order.append(patient_id)
            metadata[patient_id] = dict(zip(names, fields[1:]))
    return metadata_order, metadata

metadata_order, metadata = load_metadata()

# Blood samples: every patient listed in the metadata, in file order.
blood_IDs = list(metadata_order)
blood_CELs = [metadata[pid]["blood_cel"] for pid in blood_IDs]
blood_MA = [int(metadata[pid]["modal_allele"]) for pid in blood_IDs]
blood_PA = [metadata[pid]["progenitor_allele"] for pid in blood_IDs]
blood_MIRS = [metadata[pid]["MIRS"] for pid in blood_IDs]

# Muscle samples: only patients whose muscle_cel field is not the
# "refused_biopsy" marker.
muscle_IDs = [
    pid for pid in metadata_order
    if metadata[pid]["muscle_cel"] != "refused_biopsy"
]
muscle_CELs = [metadata[pid]["muscle_cel"] for pid in muscle_IDs]
muscle_MA = [int(metadata[pid]["modal_allele"]) for pid in muscle_IDs]
muscle_PA = [metadata[pid]["progenitor_allele"] for pid in muscle_IDs]
muscle_MIRS = [metadata[pid]["MIRS"] for pid in muscle_IDs]

In [40]:
# Show the patient IDs in the order load_metadata() read them from the file.
print(metadata_order)

['111747589', '117440822', '124563003', '129523253', '141772399', '159834720', '204472077', '213653686', '229213757', '230974357', '270148799', '315805040', '321962190', '328687703', '360448352', '377666471', '387939296', '406335477', '420299717', '427374914', '449599671', '473208969', '523950865', '549452228', '551644041', '572448109', '575039926', '597785396', '661252781', '736551759', '819054051', '830225708', '873750289', '881676366', '896445336']


In [41]:
import itertools

In [42]:
def produce_data(blood_IDs, gene_names, experiment):
    """Collect per-probe expression values for a set of genes.

    For every gene, the file ``os.path.join(experiment, gene)`` is read as a
    whitespace-delimited table. Its first line is a header; columns whose
    name starts with ``"patient_"`` hold one value per patient, the other
    columns describe the probe. Each later non-blank line is one probe.

    Parameters
    ----------
    blood_IDs : list of str
        Patient IDs (without the ``"patient_"`` prefix). Despite the name it
        is used for any tissue. It must equal, in order, the patient columns
        of every gene file.
    gene_names : iterable of str
        Gene file names inside ``experiment``. A ``set``/``frozenset`` is
        processed in sorted order so the row order of the result is
        reproducible between runs; any other iterable keeps its own order.
    experiment : str
        Directory containing one file per gene.

    Returns
    -------
    probe_data : numpy.ndarray
        One row per probe and one column per entry of ``blood_IDs`` (float
        values). Empty, one-dimensional array when no probe rows were read.
    probe_IDs : list of str
        One identifier per row of ``probe_data``, built by joining the
        probe's descriptive columns with ``"_"``.

    Raises
    ------
    AssertionError
        If the patient columns of a gene file differ from ``blood_IDs``.
    KeyError
        If a gene file's header lacks one of the descriptive columns.
    """
    prefix = "patient_"
    # NOTE(review): "genocode_left" looks like a typo for "genecode_left", but
    # it has to match the column name used in the data files - confirm before
    # changing it.
    signature_columns = (
        "gene_name", "probeset_id", "seq5to3plus", "chrom", "strand",
        "genocode_left", "genecode_right",
    )

    # Sets of strings iterate in an order that can change from one
    # interpreter run to the next (hash randomisation), which would shuffle
    # the rows of the saved matrix. Sorting pins the order.
    if isinstance(gene_names, (set, frozenset)):
        gene_names = sorted(gene_names)

    probe_data = []
    probe_IDs = []
    for gene in gene_names:
        gene_path = os.path.join(experiment, gene)
        with open(gene_path) as f:
            signature_idx = []
            patient_idx = []
            for i, line in enumerate(f):
                fields = line.rstrip().split()
                if i == 0:
                    file_patients = [
                        name[len(prefix):]
                        for name in fields
                        if name.startswith(prefix)
                    ]
                    # Raised explicitly (rather than via `assert`) so the
                    # check also runs under `python -O`.
                    if blood_IDs != file_patients:
                        raise AssertionError(
                            "patient columns in %r do not match the "
                            "requested patient IDs" % gene_path
                        )
                    headers = {header: col for col, header in enumerate(fields)}
                    # Resolve column positions once per file instead of once
                    # per probe row.
                    signature_idx = [headers[name] for name in signature_columns]
                    patient_idx = [headers[prefix + pid] for pid in blood_IDs]
                elif fields:
                    probe_IDs.append("_".join(fields[col] for col in signature_idx))
                    probe_data.append([float(fields[col]) for col in patient_idx])
    probe_data = numpy.array(probe_data)
    return probe_data, probe_IDs

In [43]:
# Probe matrix and probe signatures for the genes of interest that have a
# file in the blood experiment directory.
blood_data, blood_ids = produce_data(
    blood_IDs,
    nakamori_genes & genecode_genes,
    "experiment_blood",
)

In [44]:
# Same extraction for the muscle experiment, restricted to the patients in
# muscle_IDs.
muscle_data, muscle_ids = produce_data(
    muscle_IDs,
    nakamori_genes & genecode_genes,
    "experiment_muscle",
)

In [45]:
# Output directory for the blood files written by the following cells.
btarget = "blood_predictions"
# exist_ok=True keeps re-running this cell harmless. Unlike catching
# FileExistsError around os.mkdir, it still raises when the path exists but
# is not a directory, instead of failing later at write time.
os.makedirs(btarget, exist_ok=True)

In [46]:
# Output directory for the muscle files written by the following cells.
mtarget = "muscle_predictions"
# exist_ok=True keeps re-running this cell harmless. Unlike catching
# FileExistsError around os.mkdir, it still raises when the path exists but
# is not a directory, instead of failing later at write time.
os.makedirs(mtarget, exist_ok=True)

In [47]:
# Write both probe matrices as plain text using numpy's default format.
numpy.savetxt(
    fname=os.path.join(btarget, "blood_data"),
    X=blood_data,
)
numpy.savetxt(
    fname=os.path.join(mtarget, "muscle_data"),
    X=muscle_data,
)

In [48]:
import json

# Probe signatures, one per row of the saved blood matrix.
with open(os.path.join(btarget, "blood_ids"), "w") as handle:
    handle.write(json.dumps(blood_ids))

# "blood_rt" receives the integer modal-allele values of the blood samples.
with open(os.path.join(btarget, "blood_rt"), "w") as handle:
    handle.write(json.dumps(blood_MA))

In [49]:
# Probe signatures, one per row of the saved muscle matrix.
with open(os.path.join(mtarget, "muscle_ids"), "w") as handle:
    handle.write(json.dumps(muscle_ids))

# "muscle_rt" receives the integer modal-allele values of the muscle samples.
with open(os.path.join(mtarget, "muscle_rt"), "w") as handle:
    handle.write(json.dumps(muscle_MA))