# Analysis of generated microarray data

In [1]:
import numpy as np
import scipy.stats

filename = "generated_data"
with open(filename) as f:
    rowID = None
    while rowID != "Affymetrix":
        line = f.readline().rstrip().split()
        rowID = line[0]
        if rowID == "IDs":
            columns = range(len(line))[1:]
            ids = line[1:]
        if rowID == "ModalAllele":
            MODAL_ALLELE = [int(i) for i in line[1:]]
        if rowID == "ActualMAL":
            ACTUAL_MAL = [int(i) for i in line[1:]]
        if rowID == "Affymetrix":
            AFFECTED_GENES = [i for i in line[1:]]
            CLEANED = np.loadtxt(f, delimiter="\t", usecols=columns)

In [2]:
CLEANED.shape

(200000, 30)

In [3]:
T_CLUSTER_IDS = []

MAP_T_CLUSTER_INDEX = {}

read = False
with open(filename) as f:
    lines = f.readlines()
    i = 0
    for line in lines:
        line = line.rstrip().split("\t")
        if read:
            T_CLUSTER_IDS.append(line[0])
            MAP_T_CLUSTER_INDEX[line[0]] = i
            i += 1
        if line:
            if line[0] == "Affymetrix":
                read = True    

In [5]:
def numpyAnalysis(CLEANED):
    RESIDUALS = np.zeros_like(CLEANED)

    rowNo, columnNo = CLEANED.shape

    GENE_PVALUES = []

    for i in range(rowNo):
        slope, intercept, r_value, p_value, _ = scipy.stats.linregress(MODAL_ALLELE, list(CLEANED[i]))
        GENE_PVALUES.append(p_value)
        expected = [slope * value + intercept for value in MODAL_ALLELE]
        residuals = [actual - expected for actual, expected in zip(CLEANED[i], expected)]
        for j, residual in enumerate(residuals):
            RESIDUALS[i][j] = residual

    RESIDUALS_PVALUES = []
    ISEs = []
    for i in range(columnNo):
        p_value = scipy.stats.ttest_1samp(RESIDUALS[..., i], 0)[1]
        ISEs.append(sum(RESIDUALS[..., i])/rowNo)
        RESIDUALS_PVALUES.append(p_value)
    return (CLEANED, RESIDUALS, RESIDUALS_PVALUES, MODAL_ALLELE, ISEs, GENE_PVALUES)

In [6]:
NON_ADJUSTED = numpyAnalysis(CLEANED)

CLEANED, RESIDUALS, RESIDUALS_PVALUES, MODAL_ALLELE, ISEs, GENE_PVALUES = NON_ADJUSTED

_ = CLEANED # RMA normalised Affymetrix data
_ = RESIDUALS # Residuals computed from MODAL_ALLELE
_ = RESIDUALS_PVALUES # Existence of Individual Specific Effects (ISEs).
_ = MODAL_ALLELE # Modal allele length
_ = ISEs # Individual Specific Effects.
_ = GENE_PVALUES # P-value for each gene

## Mean-normalisation.

In [7]:
# First, compute MEAN_GENE_ADJUSTMENTS

MEAN_GEL = [] # mean gene expression levels, per individual.

for i in range(CLEANED.shape[1]):
    MEAN_GEL.append(sum(CLEANED[...,i])/CLEANED.shape[0])
MEAN_MEAN_GEL = sum(MEAN_GEL)/len(MEAN_GEL)

MEAN_GENE_ADJUSTMENT = []

for k in range(len(MEAN_GEL)):
    MEAN_GENE_ADJUSTMENT.append(MEAN_GEL[k] - MEAN_MEAN_GEL)

print(MEAN_GEL[:5])

[-0.0018936745016989848, -0.0021945193734373316, 0.0033152928639780966, -0.0007943493933158047, -0.0015405297270091581]


In [8]:
# Then, compute ADJUSTED_CLEANED

ADJUSTED_CLEANED = np.copy(CLEANED)
for i in range(CLEANED.shape[0]):
    for j in range(CLEANED.shape[1]):
        ADJUSTED_CLEANED[i][j] -= MEAN_GENE_ADJUSTMENT[j]

adjusted_means = []

for j in range(ADJUSTED_CLEANED.shape[1]):
    adjusted_means.append(sum(ADJUSTED_CLEANED[:,j])/ADJUSTED_CLEANED.shape[0])

In [9]:
MEAN_ADJUSTED = numpyAnalysis(ADJUSTED_CLEANED)

## Quantile normalisation

In [10]:
def sortDist(d):
    sortedd = [(v, i) for i, v in enumerate(d)]
    sortedd.sort()
    return sortedd

def avgDist(*args):
    toReturn = []
    for tuplas in zip(*args):
        toAdd = float(0)
        for v, _ in tuplas:
            toAdd += v
        toAdd /= len(tuplas)
        toReturn.append(toAdd)
    return toReturn

def quantileNormalise(*args):
    args = [sortDist(d) for d in args]
    avgd = avgDist(*args)
    toReturn = []
    for dist in args:
        normDist = [(i, a) for a, (v, i) in zip(avgd, dist)]
        normDist.sort()
        normDist = [j for (i, j) in normDist]
        toReturn.append(normDist)
    return toReturn

d1 = [10, 9, 11, 23]
d2 = [4, 6, 7, 5]

assert(quantileNormalise(d1, d2) == [[7.5, 6.5, 8.5, 15], [6.5, 8.5, 15, 7.5]])

distributions = []

for i in range(CLEANED.shape[1]):
    distributions.append(CLEANED[...,i])

QUANT_NORM = quantileNormalise(*distributions)

In [11]:
cleanedQuant = np.zeros_like(CLEANED)
for i, dist in enumerate(QUANT_NORM):
    for j, value in enumerate(dist):
        cleanedQuant[j][i] = value

In [12]:
QUANT_ADJUSTED = numpyAnalysis(cleanedQuant)

# Gene significance using Benjamini-Hochberg

## first, load transcription cluster ids from `cleaned`

In [13]:
ADJUSTED_CLEANED, ADJUSTED_RESIDUALS, ADJUSTED_RESIDUALS_PVALUES, ADJUSTED_MODAL_ALLELE, ADJUSTED_ISEs, ADJUSTED_GENE_PVALUES = MEAN_ADJUSTED

In [14]:
def computeBenjhoch(ADJUSTED):
    benjhoch = list(zip(ADJUSTED[5], T_CLUSTER_IDS))
    benjhoch.sort()

    m = len(benjhoch)
    k = range(1, len(benjhoch) + 1)
    sortedGenes = [((a[0] * m)/(b), a[1]) for a, b in zip(benjhoch, k)]
    currentMin = sortedGenes[-1][0]
    processedGenesRev = []
    for (FDR, gene) in sortedGenes[::-1]:
        if FDR < currentMin:
            currentMin = FDR
        processedGenesRev.append((currentMin, gene))
    processedGenes = processedGenesRev[::-1]
    return processedGenes

def kSig(ADJUSTED, k):
    return [(i + 1, h, j) for (i, (j, h)) in enumerate(computeBenjhoch(ADJUSTED)[:k])]

In [15]:
def computeP(ADJUSTED):
    p = list(zip(ADJUSTED[5], T_CLUSTER_IDS))
    p.sort()
    return p

## Or give a list of transcription clusters with FDR of 50%

In [16]:
def genesForAlpha(processedGenes, alpha):
    lista = []
    for i, (FDR, ida) in enumerate(processedGenes):
        if FDR < alpha:
            lista.append((FDR, ida))
        else:
            break
    return lista

ALPHA = 0.001
sigGenes = genesForAlpha(computeBenjhoch(QUANT_ADJUSTED), ALPHA)
noSigGenes = len(sigGenes)
print("number of genes at alpha", ALPHA, "is", noSigGenes)
len(sigGenes)

number of genes at alpha 0.001 is 0


0

# Work out the mapping from transcription clusters to gene names

In [17]:
probeset_geneid = {}
def unquote(s):
    if len(s) < 2:
        return s
    else:
        if s[0] == '"' and s[-1] == '"':
            return s[1:-1]
    return s
with open("pathway_enrichment/annotated_probesets.csv") as f:
    for i, line in enumerate(f):
        if i != 0:
            line = line.strip().split("\t")
            probeset_geneid[line[1]] = line[0]
trans_gene_id = {}
counter = 0
with open("pathway_enrichment/HuEx-1_0-st-v2.na36.hg19.probeset.csv") as f:
    for i, line in enumerate(f):
        if i < 24:
            pass
        else:
            line = line.rstrip().split(",")
            line = [unquote(i) for i in line]
            probeset_id, transcription_id = line[0], line[6]
            try:
                trans_gene_id[transcription_id] = probeset_geneid[probeset_id]
            except:
                counter += 1

In [18]:
trans_gene_id

{'2315125': 'OR4F5',
 '2315251': 'OR4F29',
 '2315554': 'TTLL10',
 '2315633': 'B3GALT6',
 '2315674': 'SCNN1D',
 '2315739': 'PUSL1',
 '2315773': 'CPTP',
 '2315786': 'TAS1R3',
 '2315880': 'TMEM88B',
 '2315894': 'VWA1',
 '2315918': 'ATAD3C',
 '2315942': 'ATAD3C',
 '2315951': 'ATAD3A',
 '2316218': 'CALML6',
 '2316245': 'PRKCZ',
 '2316343': 'AL590822.2',
 '2316345': 'AL590822.2',
 '2316347': 'AL590822.2',
 '2316350': 'AL590822.2',
 '2316379': 'SKI',
 '2316558': 'RER1',
 '2316605': 'PLCH2',
 '2316746': 'FAM213B',
 '2316905': 'ACTRT2',
 '2316953': 'PRDM16',
 '2317246': 'ARHGEF16',
 '2317317': 'TP73',
 '2317434': 'TPRG1L',
 '2317472': 'CCDC27',
 '2317498': 'SMIM1',
 '2317512': 'DFFB',
 '2317686': 'AJAP1',
 '2318086': 'KCNAB2',
 '2318132': 'KCNAB2',
 '2318157': 'RNF207',
 '2318170': 'RNF207',
 '2318220': 'HES3',
 '2318257': 'ESPN',
 '2318338': 'TAS1R1',
 '2318364': 'ZBTB48',
 '2318398': 'PHF13',
 '2318416': 'THAP3',
 '2318455': 'CAMTA1',
 '2318637': 'VAMP3',
 '2318656': 'PER3',
 '2318736': 'PARK

# Gene list without DSE

In [19]:
len(sigGenes)

0

In [20]:
sigGeneNoDSE = [(i, trans_gene_id[j]) for (i, j) in sigGenes if j in trans_gene_id]

In [21]:
len(sigGeneNoDSE)

0

# Disease Specific Effects

## Let us partition k most significant genes and adjust them for DSE to check if it helps improve significance.

In [22]:
import math
def computeDSE(CLEANED, RESIDUALS, genesA, genesB, genesRest):
    #print("hi", CLEANED.shape)
    DSE = [0] * CLEANED.shape[1]
    relevantGenesAIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesA]
    relevantGenesBIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesB]
    relevantGenesRestIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesRest]

    for i in range(CLEANED.shape[1]):
        relevantResiduals = [RESIDUALS[j,i] for j in relevantGenesAIndex]
        DSE[i] = sum(relevantResiduals)/len(relevantResiduals)

    # copy CLEANED into another array
    DSE_ADJUSTED = np.copy(CLEANED)

    # adjust genes in set B with DSEs computed from genes in set A
    #print(len(relevantGenesBIndex + relevantGenesRestIndex))
    for i in relevantGenesBIndex + relevantGenesRestIndex:
        for j in range(CLEANED.shape[1]):
            DSE_ADJUSTED[i][j] -= DSE[j]
    return DSE_ADJUSTED, relevantGenesBIndex

def computeReverseDSE(CLEANED, RESIDUALS, genesA, genesB):
    DSE = [0] * CLEANED.shape[1]
    relevantGenesAIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesA]
    relevantGenesBIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesB]

    for i in range(CLEANED.shape[1]):
        relevantResiduals = [RESIDUALS[j,i] for j in relevantGenesAIndex]
        DSE[i] = sum(relevantResiduals)/len(relevantResiduals)

    # copy CLEANED into another array
    DSE_ADJUSTED = np.copy(CLEANED)

    # adjust genes in set B with DSEs computed from genes in set A
    for i in relevantGenesBIndex:
        for j in range(CLEANED.shape[1]):
            DSE_ADJUSTED[i][j] += DSE[j]
    return DSE_ADJUSTED, relevantGenesBIndex

def fractionImprovedPValues(GENE_PVALUES, GENE_PVALUES_B, relevantGenesBIndex):
    previousPValues = [GENE_PVALUES[i] for i in relevantGenesBIndex]
    currentPValues = [GENE_PVALUES_B[i] for i in relevantGenesBIndex]
    countSmaller = 0
    #print(previousPValues, currentPValues)
    for i, _ in enumerate(relevantGenesBIndex):
        if (currentPValues[i] < previousPValues[i]):
            countSmaller += 1
    return countSmaller/len(currentPValues)

def howMuchImprovedPValues(GENE_PVALUES, GENE_PVALUES_B, relevantGenesBIndex):
    previousPValues = [GENE_PVALUES[i] for i in relevantGenesBIndex]
    currentPValues = [GENE_PVALUES_B[i] for i in relevantGenesBIndex]
    countSmaller = 0
    improvements = []
    worsens = []
    for i, _ in enumerate(relevantGenesBIndex):
        if previousPValues[i] > currentPValues[i]:
            improvement = previousPValues[i] / currentPValues[i]
            improvements.append(improvement)
        else:
            worsen = currentPValues[i] / previousPValues[i]
            worsens.append(worsen)
    return sum(improvements)/len(improvements), sum(worsens)/len(improvements)
    

def check_DSE(x, DATASET, reverse = False):
    compute = computeDSE
    if reverse:
        compute = computeReverseDSE
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    genesA = genes[:x//2]
    genesB = genes[x//2:x]
    genesRest = genes[x:]
    DSE_ADJUSTED, relevantGenesBIndex = compute(DATASET[0], DATASET[1], genesA, genesB, genesRest)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    fractionImproved = fractionImprovedPValues(DATASET[5], ADJUSTED_DATASET[5], relevantGenesBIndex)
    averageImprovement = howMuchImprovedPValues(DATASET[5], ADJUSTED_DATASET[5], relevantGenesBIndex)
    print("average improvement is {}".format(averageImprovement))
    return fractionImproved

In [23]:
def give_adjusted_dataset(x, DATASET):
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    print(genes[:10])
    genesA = genes[:x//2]
    genesB = genes[x//2:x]
    genesRest = genes[x:]
    DSE_ADJUSTED, relevantGenesBIndex = computeDSE(DATASET[0], DATASET[1], genesA, genesB, genesRest)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    return ADJUSTED_DATASET, relevantGenesBIndex

In [24]:
def give_sig_genes(x, DATASET):
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    genesA = genes[:x//2]
    genesB = genes[x//2:x]
    genesRest = genes[x:]
    #sig_Genes1 = genesForAlpha(computeBenjhoch(DATASET), ALPHA)
    DSE_ADJUSTED, relevantGenesBIndex = computeDSE(DATASET[0], DATASET[1], genesA, genesB, genesRest)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    sigGenes2 = genesForAlpha(computeBenjhoch(ADJUSTED_DATASET), ALPHA)
    noSigGenes = len(sigGenes2)
    print("number of genes at alpha", ALPHA, "is", noSigGenes)
    return [(i, trans_gene_id[j]) for (i, j) in sigGenes2 if j in trans_gene_id]

In [31]:
check_DSE(256, QUANT_ADJUSTED)

average improvement is (1.1145158940089153, 1.7068966443621865)


0.3984375

In [32]:
sigGeneDSE = give_sig_genes(256, QUANT_ADJUSTED)

number of genes at alpha 0.001 is 0


In [33]:
print(len(sigGeneDSE))

0


In [34]:
print(len(sigGeneNoDSE))

0


# Inclusions

In [35]:
noDSE = {k[1] for k in sigGeneNoDSE}

In [36]:
DSE = {k[1] for k in sigGeneDSE}

# Counts before and after ISE

In [37]:
DSE_ADJUSTED, genes_B_index = give_adjusted_dataset(256, QUANT_ADJUSTED)

[(2.667856238100745e-06, 'id184036'), (2.8606234700480865e-06, 'id145342'), (6.3174190960069496e-06, 'id116656'), (6.7044674579257417e-06, 'id126558'), (1.1890313511122048e-05, 'id134352'), (1.4575827150118246e-05, 'id167716'), (1.6693168706569494e-05, 'id152771'), (1.9348387293242356e-05, 'id190020'), (2.068402946696098e-05, 'id48541'), (2.4320226816270424e-05, 'id143594')]


In [38]:
DSE_good_i = []
for i, p in enumerate(DSE_ADJUSTED[5]):
    if p < 0.000000183:
        DSE_good_i.append(i)

In [39]:
QUANT_good_i = []
for i, p in enumerate(QUANT_ADJUSTED[5]):
    if p < 0.000000183:
        QUANT_good_i.append(i)

In [40]:
len(QUANT_good_i)

0

In [41]:
len(set(DSE_good_i).intersection(set(QUANT_good_i)))

0

In [42]:
trans_gene_id

{'2315125': 'OR4F5',
 '2315251': 'OR4F29',
 '2315554': 'TTLL10',
 '2315633': 'B3GALT6',
 '2315674': 'SCNN1D',
 '2315739': 'PUSL1',
 '2315773': 'CPTP',
 '2315786': 'TAS1R3',
 '2315880': 'TMEM88B',
 '2315894': 'VWA1',
 '2315918': 'ATAD3C',
 '2315942': 'ATAD3C',
 '2315951': 'ATAD3A',
 '2316218': 'CALML6',
 '2316245': 'PRKCZ',
 '2316343': 'AL590822.2',
 '2316345': 'AL590822.2',
 '2316347': 'AL590822.2',
 '2316350': 'AL590822.2',
 '2316379': 'SKI',
 '2316558': 'RER1',
 '2316605': 'PLCH2',
 '2316746': 'FAM213B',
 '2316905': 'ACTRT2',
 '2316953': 'PRDM16',
 '2317246': 'ARHGEF16',
 '2317317': 'TP73',
 '2317434': 'TPRG1L',
 '2317472': 'CCDC27',
 '2317498': 'SMIM1',
 '2317512': 'DFFB',
 '2317686': 'AJAP1',
 '2318086': 'KCNAB2',
 '2318132': 'KCNAB2',
 '2318157': 'RNF207',
 '2318170': 'RNF207',
 '2318220': 'HES3',
 '2318257': 'ESPN',
 '2318338': 'TAS1R1',
 '2318364': 'ZBTB48',
 '2318398': 'PHF13',
 '2318416': 'THAP3',
 '2318455': 'CAMTA1',
 '2318637': 'VAMP3',
 '2318656': 'PER3',
 '2318736': 'PARK

In [43]:
DSE_translated_result = []
for i in DSE_good_i:
    gene_id = T_CLUSTER_IDS[i]
    if gene_id in trans_gene_id:
        DSE_translated_result.append(trans_gene_id[gene_id])

In [44]:
QUANT_translated_result = []
for i in QUANT_good_i:
    gene_id = T_CLUSTER_IDS[i]
    if gene_id in trans_gene_id:
        QUANT_translated_result.append(trans_gene_id[gene_id])

In [45]:
len(QUANT_translated_result)

0

In [46]:
for i in QUANT_translated_result:
    print(i)

In [47]:
len(DSE_translated_result)

0

In [48]:
for i in DSE_translated_result:
    print(i)

In [49]:
def geneNamesatAlpha(experiment, alpha):
    genes = genesForAlpha(computeBenjhoch(experiment), alpha)
    return [(i, trans_gene_id[j]) for (i, j) in genes if j in trans_gene_id]

In [50]:
def debugGeneNamesatAlpha(experiment, alpha):
    genes = genesForAlpha(computeBenjhoch(experiment), alpha)
    return genes

In [51]:
thresholds = [0.000003, 0.001, 0.01]

In [52]:
fractionImproved = fractionImprovedPValues(QUANT_ADJUSTED[5], DSE_ADJUSTED[5], genes_B_index)

In [53]:
fractionImproved

0.3984375

In [54]:
v = []
for alpha in thresholds:
    v.append(geneNamesatAlpha(QUANT_ADJUSTED, alpha))

In [55]:
vv = []
for alpha in thresholds:
    vv.append(debugGeneNamesatAlpha(QUANT_ADJUSTED, alpha))

In [56]:
[len(i) for i in v]

[0, 0, 0]

In [57]:
w = []
for alpha in thresholds:
    w.append(geneNamesatAlpha(DSE_ADJUSTED, alpha))

In [58]:
ww = []
for alpha in thresholds:
    ww.append(geneNamesatAlpha(DSE_ADJUSTED, alpha))

In [59]:
[len(i) for i in w]

[0, 0, 0]

In [60]:
t = 0.00000020

In [61]:
len(sorted([i for i in DSE_ADJUSTED[5] if i <= t]))

0

In [62]:
len(sorted([i for i in QUANT_ADJUSTED[5] if i <= t]))

0

In [63]:
noDSE.difference(DSE)

set()

In [64]:
DSE.difference(noDSE)

set()

In [65]:
import random
def check_DSE_random(x, DATASET):
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    randomFudge = genes[:]
    random.shuffle(randomFudge)
    genesA = randomFudge[:x//2]
    genesB = genes[x//2:x]
    genesBSet = set([i[1] for i in genesB])
    genesASet = ([i[1] for i in genesA])
    restGenes = []
    for gene in genes:
        if gene[1] not in genesASet and gene[1] not in genesBSet:
            restGenes.append(gene)
    DSE_ADJUSTED, relevantGenesBIndex = computeDSE(DATASET[0], DATASET[1], genesA, genesB, restGenes)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    fractionImproved = fractionImprovedPValues(DATASET[5], ADJUSTED_DATASET[5], relevantGenesBIndex)
    return fractionImproved

In [66]:
experiments = [2**i for i in range(5, 15)] + [len(QUANT_ADJUSTED[5])]
experiments

[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 200000]

In [67]:
results = {}
for i in experiments:
    results[i] = check_DSE(i, QUANT_ADJUSTED)

average improvement is (1.4526288646522039, 6.7430277405252887)
average improvement is (1.1707071678650995, 2.5979076852482592)
average improvement is (1.1157234767508974, 3.0877317370651109)
average improvement is (1.1145158940089153, 1.7068966443621865)
average improvement is (1.0838467892710149, 1.5260436740727525)
average improvement is (1.0457491037479094, 1.2264789149016799)
average improvement is (1.0316725007328431, 1.136693169173091)
average improvement is (1.0196547087623713, 1.1666379626035177)
average improvement is (1.011960104533886, 1.0935584145448596)
average improvement is (1.0057099697901637, 1.0346843672022064)
average improvement is (1.0001139389354752, 1.0251476213197654)


In [68]:
results

{32: 0.25,
 64: 0.34375,
 128: 0.28125,
 256: 0.3984375,
 512: 0.41796875,
 1024: 0.4609375,
 2048: 0.4765625,
 4096: 0.46630859375,
 8192: 0.480712890625,
 16384: 0.492919921875,
 200000: 0.49382}

In [70]:
import time
start = time.time()
resultsRandom = {}
testRange = experiments
for i in testRange:
    resultsRandom[i] = []
for j in range(500):
    print("tick")
    for i in testRange:
        res = check_DSE_random(i, QUANT_ADJUSTED)
        resultsRandom[i].append(res)
stop = time.time()

tick


KeyboardInterrupt: 

In [None]:
print(stop - start)

In [None]:
resultsRandom

In [None]:
stop - start

# For ALL check which batch the data comes from:

In [None]:
index_to_batch = []
if CEL_LOCATION.split("/")[-1] == "ALL":
    for file in CELFILES:
        index_to_batch.append(file.split(".")[0].split("_")[-1][0])

# Write results to a file

In [None]:
import json
with open(CEL_LOCATION + "/results", "w") as f:
    json.dump({
        "QA": computeBenjhoch(QUANT_ADJUSTED),
        "QA_raw": [list(key) for key in np.transpose(QUANT_ADJUSTED[0])],
        "MODAL_ALLELE": MODAL_ALLELE,
        "GENDER" : SEX,
        "labels": index_to_batch,
        "QA_P" : computeP(QUANT_ADJUSTED),
        "NON_ADJUSTED": computeBenjhoch(NON_ADJUSTED),
        "NON_ADJUSTED_P": computeP(NON_ADJUSTED),
        "MEAN_ADJUSTED": computeBenjhoch(MEAN_ADJUSTED),
        "MEAN_ADJUSTED_P": computeP(MEAN_ADJUSTED),
        "results" : results,
        "resultsRandom": resultsRandom
    }, f)