# Separately quantile normalise blood and muscle data. 

This step uses ~32 GB of RAM. We ran it on an AWS m5.4xlarge instance with 64 GB of RAM.

In [1]:
import os
import os.path
import numpy
import math
import time

def sortDist(d):
    """Return the values of *d* in ascending order, tagged with their positions.

    Each element of the result is a ``(value, original_index)`` tuple.  Equal
    values are ordered by their original index because the tuples are compared
    element by element.
    """
    return sorted((value, position) for position, value in enumerate(d))

def avgDist(args):
    """Return the per-rank mean of several sorted distributions.

    *args* is a sequence of lists of ``(value, index)`` tuples, as produced by
    ``sortDist``.  The k-th element of the result is the arithmetic mean of the
    k-th values of all distributions.  Iteration stops at the shortest
    distribution because the lists are walked with ``zip``.
    """
    def rank_mean(entries):
        # Plain left-to-right addition, starting from a float zero.
        total = float(0)
        for value, _ in entries:
            total += value
        return total / len(entries)

    return [rank_mean(entries) for entries in zip(*args)]

def quantileNormalise(args):
    """Quantile-normalise several equally long distributions.

    Every distribution is sorted, the values sharing a rank are averaged
    across distributions, and each original value is replaced by the average
    belonging to its rank.

    Args:
        args: Iterable of sequences of numbers (one sequence per sample).

    Yields:
        One list per input distribution, in input order, holding the
        normalised values at the positions of the original values.
    """
    ranked = [sortDist(d) for d in args]
    rank_means = avgDist(ranked)
    for dist in ranked:
        # Pair every rank mean with the original position of the value that
        # held that rank, then order by position to undo the sort.
        by_position = sorted(
            (position, mean) for mean, (_, position) in zip(rank_means, dist)
        )
        yield [mean for _, mean in by_position]

# Worked example: two four-element samples whose values rank differently.
d1 = [10, 9, 11, 23]
d2 = [4, 6, 7, 5]

# The rank-wise means of the sorted samples are 6.5, 7.5, 8.5 and 15; each
# sample receives them in its own rank order.
assert list(quantileNormalise([d1, d2])) == [
    [7.5, 6.5, 8.5, 15],
    [6.5, 8.5, 15, 7.5],
]

In [2]:
def load_metadata():
    """Read ``metadata.txt`` from the working directory into a nested dict.

    The file is whitespace-separated.  The first line is a header; its first
    field is ignored and the remaining fields name the columns.  On every
    following line the first field is the record key and the remaining fields
    are paired with the column names in order.  Blank lines are skipped.

    Returns:
        A dict mapping each record key to a ``{column_name: value}`` dict.
        All values are strings.  An empty file yields an empty dict.
    """
    metadata = {}
    with open("metadata.txt") as f:
        names = f.readline().split()[1:]
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line has no key to index by.
                continue
            metadata[fields[0]] = dict(zip(names, fields[1:]))
    return metadata

In [3]:
# Load the sample table, then collect the blood CEL name of every record.
metadata = load_metadata()
blood_CELs = [record["blood_cel"] for record in metadata.values()]

In [4]:
# Muscle CEL names, leaving out records marked "refused_biopsy".
muscle_CELs = [
    record["muscle_cel"]
    for record in metadata.values()
    if record["muscle_cel"] != "refused_biopsy"
]

In [5]:
def load_intensity_data(paths):
    """Load every text file in *paths* with ``numpy.loadtxt``.

    Returns a list of arrays in the same order as *paths*; an empty *paths*
    gives an empty list.
    """
    def read_matrix(path):
        # The handle is closed as soon as the file has been parsed.
        with open(path) as handle:
            return numpy.loadtxt(handle)

    return [read_matrix(path) for path in paths]

In [6]:
def dump_qn(filenames):
    """Quantile-normalise log intensities and write one text file per input.

    For every name in *filenames* the matrix ``intensities/<name>.txt`` is
    loaded, flattened and natural-log transformed.  All samples are then
    quantile-normalised together and each result is written, reshaped to the
    dimensions of the first matrix, to ``qn/<name>.txt`` as rows of
    space-separated values.  The ``qn`` directory is created if missing.  The
    time spent normalising and writing is printed in seconds.

    Args:
        filenames: Names of the intensity files without the ``.txt`` suffix.
            Every matrix must be 2-D and hold as many elements as the first.

    Raises:
        IndexError: If *filenames* is empty (there is no first matrix to take
            the shape from).
    """
    prefix = "intensities"
    qn_result = "qn"
    intensities = [os.path.join(prefix, name + ".txt") for name in filenames]
    data = load_intensity_data(intensities)
    rows, columns = data[0].shape
    # Build the element-wise wrapper once instead of once per sample.
    elementwise_log = numpy.vectorize(math.log)
    data = [elementwise_log(sample.reshape(rows * columns)) for sample in data]
    os.makedirs(qn_result, exist_ok=True)
    start = time.time()
    for path, result in zip(intensities, quantileNormalise(data)):
        matrix = numpy.array(result).reshape(rows, columns)
        with open(os.path.join(qn_result, os.path.basename(path)), "w") as f:
            # One write per row rather than one print per element.  Every
            # element, including the last of a row, is followed by a single
            # space, so the bytes on disk match the previous format.
            f.writelines(
                "".join(str(element) + " " for element in row) + "\n"
                for row in matrix
            )
    stop = time.time()
    print(stop - start)

In [8]:
# Normalise the muscle samples on their own; the printed number is the
# elapsed time in seconds reported by dump_qn.
dump_qn(muscle_CELs)

814.5898406505585


In [9]:
# Normalise the blood samples on their own; the printed number is the
# elapsed time in seconds reported by dump_qn.
dump_qn(blood_CELs)

1101.1111648082733


In [10]:
# Number of blood plus muscle CEL entries selected from the metadata.
len(blood_CELs) + len(muscle_CELs)

62

In [29]:
# Names found in the "intensities" directory, cut at the first ".".
CELs_from_intensities = {
    entry.split(".")[0] for entry in os.listdir("intensities")
}

In [30]:
# Blood and muscle CEL names from the metadata, cut at the first ".".
CELs_from_metadata = {
    name.split(".")[0] for name in blood_CELs + muscle_CELs
}

In [31]:
# Names present in the intensities directory but absent from the metadata.
CELs_from_intensities.difference(CELs_from_metadata)

{'111747589_MR',
 '117440822_MR',
 '159834720_M',
 '204472077_M',
 '230974357_M',
 '360448352_MR',
 '377666471_MR',
 '387939296_MR',
 '406335477_M',
 '419550533_M',
 '419550533_MR',
 '572448109_MR',
 '597785396_M',
 '881676366_M'}

In [32]:
# Names listed in the metadata that have no file in the intensities directory.
CELs_from_metadata.difference(CELs_from_intensities)

set()