# Analysis of microarry data from patients with Sepsis vs SIRS.

# Pre-process & RMA Normalise data

In [3]:
import os
import os.path
import sys
sys.argv = ["", "CEL_files"]
CEL_LOCATION = sys.argv[1]

CELFILES = os.listdir(CEL_LOCATION)

CELFILES = [file for file in CELFILES if file.split(".")[-1].lower() == "cel"]

print(CELFILES)

['GSM1914873_A0056073_2.CEL', 'GSM1914889_A0056111.CEL', 'GSM1914851_A0056373.CEL', 'GSM1914855_A0056477.CEL', 'GSM1914853_A0056306.CEL', 'GSM1914819_A0056303.CEL', 'GSM1914885_A0056002.CEL', 'GSM1914844_A0057655.CEL', 'GSM1914897_A0055990.CEL', 'GSM1914818_A0056371.CEL', 'GSM1914817_A0062016.CEL', 'GSM1914815_A0057640.CEL', 'GSM1914878_A0056058.CEL', 'GSM1914826_A0056396.CEL', 'GSM1914832_A0056315.CEL', 'GSM1914906_A0056080.CEL', 'GSM1914808_A0057637.CEL', 'GSM1914890_A0055957.CEL', 'GSM1914836_A0062102.CEL', 'GSM1914903_A0056151.CEL', 'GSM1914880_A0056040.CEL', 'GSM1914895_A0056010.CEL', 'GSM1914814_A0056275.CEL', 'GSM1914911_A0056069.CEL', 'GSM1914866_A0056046.CEL', 'GSM1914835_A0062096.CEL', 'GSM1914870_A0056035.CEL', 'GSM1914879_A0056195.CEL', 'GSM1914820_A0056285.CEL', 'GSM1914860_A0056430.CEL', 'GSM1914841_A0057656.CEL', 'GSM1914827_A0062054.CEL', 'GSM1914886_A0055959.CEL', 'GSM1914865_A0056051.CEL', 'GSM1914816_A0062011.CEL', 'GSM1914834_A0056341.CEL', 'GSM1914894_A0056013.CEL'

In [4]:
# Set up matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

#%matplotlib inline

In [5]:
# Carry out RMA normalisation on all CEL files. Write output to core file.
# Caution: RMA data is NOT quantile-normalised.
# This step takes a few minutes (unless core file already exists).

import utils
corefile =  os.path.join(CEL_LOCATION, "core")
try:
    with open(corefile, "rb") as f:
        print("core file exists")
except FileNotFoundError:
    utils.executeNotebook("normalise.ipynb", CEL_LOCATION)
    print("core file created")

core file exists


In [6]:
ID_TO_REPEAT = {}
ID_TO_SEX = {}
with open("CEL_files/features.txt") as f:
    for line in f:
        line = line.rstrip()
        file_name, status = line.split("\t")
        if any([file_name in actual_file_name for actual_file_name in CELFILES]):
            status, sample_id = status.split("_")
            unique_id = status + sample_id
            integer = 0
            if status == "Sepsis":
                integer = 0
            elif status == "SIRS":
                integer = 1
            else:
                raise ValueError("wtf?")
            ID_TO_REPEAT[file_name] = integer
            ID_TO_SEX[file_name] = "M"
        else:
            print("where is", file_name)

In [7]:
ID_TO_REPEAT

{'GSM1914807': 0,
 'GSM1914808': 0,
 'GSM1914809': 0,
 'GSM1914810': 0,
 'GSM1914811': 0,
 'GSM1914812': 0,
 'GSM1914813': 0,
 'GSM1914814': 0,
 'GSM1914815': 0,
 'GSM1914816': 0,
 'GSM1914817': 0,
 'GSM1914818': 0,
 'GSM1914819': 0,
 'GSM1914820': 0,
 'GSM1914821': 0,
 'GSM1914822': 0,
 'GSM1914823': 0,
 'GSM1914824': 0,
 'GSM1914825': 0,
 'GSM1914826': 0,
 'GSM1914827': 0,
 'GSM1914828': 0,
 'GSM1914829': 0,
 'GSM1914830': 0,
 'GSM1914831': 0,
 'GSM1914832': 0,
 'GSM1914833': 0,
 'GSM1914834': 0,
 'GSM1914835': 0,
 'GSM1914836': 0,
 'GSM1914837': 0,
 'GSM1914838': 0,
 'GSM1914839': 0,
 'GSM1914840': 0,
 'GSM1914841': 0,
 'GSM1914842': 0,
 'GSM1914843': 0,
 'GSM1914844': 0,
 'GSM1914845': 0,
 'GSM1914846': 0,
 'GSM1914847': 0,
 'GSM1914848': 0,
 'GSM1914849': 0,
 'GSM1914850': 0,
 'GSM1914851': 0,
 'GSM1914852': 0,
 'GSM1914853': 0,
 'GSM1914854': 0,
 'GSM1914855': 0,
 'GSM1914856': 0,
 'GSM1914857': 0,
 'GSM1914858': 0,
 'GSM1914859': 0,
 'GSM1914860': 0,
 'GSM1914861': 0,
 'GSM19148

In [8]:
ID_TO_COLUMN = {}
with open(corefile, "r") as file:
    for line in file:
        for column, person in enumerate(line.rstrip().split("\t")):
            if person:
                ID_TO_COLUMN[person.split("_")[0]] = column 
        break

In [9]:
ID_TO_COLUMN

{'GSM1914807': 1,
 'GSM1914808': 2,
 'GSM1914809': 3,
 'GSM1914810': 4,
 'GSM1914811': 5,
 'GSM1914812': 6,
 'GSM1914813': 7,
 'GSM1914814': 8,
 'GSM1914815': 9,
 'GSM1914816': 10,
 'GSM1914817': 11,
 'GSM1914818': 12,
 'GSM1914819': 13,
 'GSM1914820': 14,
 'GSM1914821': 15,
 'GSM1914822': 16,
 'GSM1914823': 17,
 'GSM1914824': 18,
 'GSM1914825': 19,
 'GSM1914826': 20,
 'GSM1914827': 21,
 'GSM1914828': 22,
 'GSM1914829': 23,
 'GSM1914830': 24,
 'GSM1914831': 25,
 'GSM1914832': 26,
 'GSM1914833': 27,
 'GSM1914834': 28,
 'GSM1914835': 29,
 'GSM1914836': 30,
 'GSM1914837': 31,
 'GSM1914838': 32,
 'GSM1914839': 33,
 'GSM1914840': 34,
 'GSM1914841': 35,
 'GSM1914842': 36,
 'GSM1914843': 37,
 'GSM1914844': 38,
 'GSM1914845': 39,
 'GSM1914846': 40,
 'GSM1914847': 41,
 'GSM1914848': 42,
 'GSM1914849': 43,
 'GSM1914850': 44,
 'GSM1914851': 45,
 'GSM1914852': 46,
 'GSM1914853': 47,
 'GSM1914854': 48,
 'GSM1914855': 49,
 'GSM1914856': 50,
 'GSM1914857': 51,
 'GSM1914858': 52,
 'GSM1914859': 53,
 '

In [10]:
with open(CEL_LOCATION + "/header", "w") as f:
    f.write("IDs")
    for ida in ID_TO_COLUMN:
        f.write("\t")
        f.write(ida)
    f.write("\n")
    f.write("ModalAllele")
    for ida in ID_TO_COLUMN:
        f.write("\t")
        f.write(str(ID_TO_REPEAT[ida]))
    f.write("\n")
    f.write("GENDER")
    for ida in ID_TO_COLUMN:
        f.write("\t")
        f.write(ID_TO_SEX[ida])
    f.write("\n")
    f.write("Affymetrix\n")

In [11]:
import subprocess
#with open("cleaned", "wb") as f:
subprocess.run("tail -n +2 " + CEL_LOCATION + "/core  | cat " + CEL_LOCATION + "/header - > " + CEL_LOCATION + "/cleaned", shell=True)

CompletedProcess(args='tail -n +2 CEL_files/core  | cat CEL_files/header - > CEL_files/cleaned', returncode=0)

## pre-processing & normalisation finished. File `cleaned` written to hard drive in current working directory.

# Analyse the data using Numpy

1. Fit Modal allele length to each gene using ordinary linear regression.
2. Expose the primary dataset as `CLEANED` residuals as `RESIDUALS`, Existence of Individuals Specific Effects as `RESIDUALS_PVALUES`, and Individual Specific Effects as `ISEs` and Modal Allele length as `MODAL_ALLELE`.

In [12]:
# Load file `cleaned` back into python

import numpy as np
import scipy.stats

filename = CEL_LOCATION + "/cleaned"
with open(filename) as f:
    rowID = None
    skipCount = 0
    while rowID != "Affymetrix":
        line = f.readline().rstrip().split()
        skipCount += 1
        rowID = line[0]
        if rowID == "IDs":
            columns = range(len(line))[1:]
            ids = line[1:]
        if rowID == "ModalAllele":
            MODAL_ALLELE = [int(i) for i in line[1:]]
        if rowID == "GENDER":
            SEX = [i for i in line[1:]]

T_CLUSTER_IDS = []

MAP_T_CLUSTER_INDEX = {}

read = False
with open(CEL_LOCATION + "/cleaned") as f:
    lines = f.readlines()
    i = 0
    for line in lines:
        line = line.rstrip().split("\t")
        if read:
            T_CLUSTER_IDS.append(line[0])
            MAP_T_CLUSTER_INDEX[line[0]] = i
            i += 1
        if line:
            if line[0] == "Affymetrix":
                read = True
with open(filename) as f:
    CLEANED = np.loadtxt(f, delimiter="\t", skiprows=skipCount, usecols=columns)

In [13]:
T_CLUSTER_IDS

['2315554',
 '2315633',
 '2315674',
 '2315739',
 '2315894',
 '2315918',
 '2315951',
 '2316218',
 '2316245',
 '2316379',
 '2316558',
 '2316605',
 '2316746',
 '2316905',
 '2316953',
 '2317246',
 '2317317',
 '2317434',
 '2317472',
 '2317512',
 '2317686',
 '2318086',
 '2318157',
 '2318170',
 '2318212',
 '2318242',
 '2318257',
 '2318338',
 '2318364',
 '2318398',
 '2318416',
 '2318455',
 '2318637',
 '2318656',
 '2318736',
 '2318743',
 '2318744',
 '2318746',
 '2318747',
 '2318751',
 '2318754',
 '2318755',
 '2318759',
 '2318761',
 '2318763',
 '2318765',
 '2318767',
 '2318769',
 '2318771',
 '2318773',
 '2318775',
 '2318777',
 '2318779',
 '2318781',
 '2318783',
 '2318785',
 '2318787',
 '2318789',
 '2318791',
 '2318793',
 '2318795',
 '2318797',
 '2318799',
 '2318801',
 '2318803',
 '2318805',
 '2318807',
 '2318809',
 '2318811',
 '2318813',
 '2318815',
 '2318817',
 '2318819',
 '2318821',
 '2318823',
 '2318825',
 '2318827',
 '2318829',
 '2318831',
 '2318833',
 '2318835',
 '2318837',
 '2318839',
 '23

In [14]:
def numpyAnalysis(CLEANED):
    RESIDUALS = np.zeros_like(CLEANED)

    rowNo, columnNo = CLEANED.shape

    GENE_PVALUES = []

    for i in range(rowNo):
        slope, intercept, r_value, p_value, _ = scipy.stats.linregress(MODAL_ALLELE, list(CLEANED[i]))
        GENE_PVALUES.append(p_value)
        expected = [slope * value + intercept for value in MODAL_ALLELE]
        residuals = [actual - expected for actual, expected in zip(CLEANED[i], expected)]
        for j, residual in enumerate(residuals):
            RESIDUALS[i][j] = residual

    RESIDUALS_PVALUES = []
    ISEs = []
    for i in range(columnNo):
        p_value = scipy.stats.ttest_1samp(RESIDUALS[..., i], 0)[1]
        ISEs.append(sum(RESIDUALS[..., i])/rowNo)
        RESIDUALS_PVALUES.append(p_value)
    return (CLEANED, RESIDUALS, RESIDUALS_PVALUES, MODAL_ALLELE, ISEs, GENE_PVALUES)

In [15]:
NON_ADJUSTED = numpyAnalysis(CLEANED)

CLEANED, RESIDUALS, RESIDUALS_PVALUES, MODAL_ALLELE, ISEs, GENE_PVALUES = NON_ADJUSTED

_ = CLEANED # RMA normalised Affymetrix data
_ = RESIDUALS # Residuals computed from MODAL_ALLELE
_ = RESIDUALS_PVALUES # Existence of Individual Specific Effects (ISEs).
_ = MODAL_ALLELE # Modal allele length
_ = ISEs # Individual Specific Effects.
_ = GENE_PVALUES # P-value for each gene

## Mean-normalisation.

In [16]:
# First, compute MEAN_GENE_ADJUSTMENTS

MEAN_GEL = [] # mean gene expression levels, per individual.

for i in range(CLEANED.shape[1]):
    MEAN_GEL.append(sum(CLEANED[...,i])/CLEANED.shape[0])
MEAN_MEAN_GEL = sum(MEAN_GEL)/len(MEAN_GEL)

MEAN_GENE_ADJUSTMENT = []

for k in range(len(MEAN_GEL)):
    MEAN_GENE_ADJUSTMENT.append(MEAN_GEL[k] - MEAN_MEAN_GEL)

print(MEAN_GEL[:5])

[7.0978029573885975, 7.1223244702828481, 7.129354537327238, 7.111849023028542, 7.1544707793368518]


In [17]:
# Then, compute ADJUSTED_CLEANED

ADJUSTED_CLEANED = np.copy(CLEANED)
for i in range(CLEANED.shape[0]):
    for j in range(CLEANED.shape[1]):
        ADJUSTED_CLEANED[i][j] -= MEAN_GENE_ADJUSTMENT[j]

adjusted_means = []

for j in range(ADJUSTED_CLEANED.shape[1]):
    adjusted_means.append(sum(ADJUSTED_CLEANED[:,j])/ADJUSTED_CLEANED.shape[0])

In [18]:
MEAN_ADJUSTED = numpyAnalysis(ADJUSTED_CLEANED)

## Quantile normalisation

In [19]:
def sortDist(d):
    sortedd = [(v, i) for i, v in enumerate(d)]
    sortedd.sort()
    return sortedd

def avgDist(*args):
    toReturn = []
    for tuplas in zip(*args):
        toAdd = float(0)
        for v, _ in tuplas:
            toAdd += v
        toAdd /= len(tuplas)
        toReturn.append(toAdd)
    return toReturn

def quantileNormalise(*args):
    args = [sortDist(d) for d in args]
    avgd = avgDist(*args)
    toReturn = []
    for dist in args:
        normDist = [(i, a) for a, (v, i) in zip(avgd, dist)]
        normDist.sort()
        normDist = [j for (i, j) in normDist]
        toReturn.append(normDist)
    return toReturn

d1 = [10, 9, 11, 23]
d2 = [4, 6, 7, 5]

assert(quantileNormalise(d1, d2) == [[7.5, 6.5, 8.5, 15], [6.5, 8.5, 15, 7.5]])

distributions = []

for i in range(CLEANED.shape[1]):
    distributions.append(CLEANED[...,i])

QUANT_NORM = quantileNormalise(*distributions)

In [20]:
cleanedQuant = np.zeros_like(CLEANED)
for i, dist in enumerate(QUANT_NORM):
    for j, value in enumerate(dist):
        cleanedQuant[j][i] = value

In [21]:
QUANT_ADJUSTED = numpyAnalysis(cleanedQuant)

# Gene significance using Benjamini-Hochberg

## first, load transcription cluster ids from `cleaned`

In [22]:
ADJUSTED_CLEANED, ADJUSTED_RESIDUALS, ADJUSTED_RESIDUALS_PVALUES, ADJUSTED_MODAL_ALLELE, ADJUSTED_ISEs, ADJUSTED_GENE_PVALUES = MEAN_ADJUSTED

In [23]:
def computeBenjhoch(ADJUSTED):
    benjhoch = list(zip(ADJUSTED[5], T_CLUSTER_IDS))
    benjhoch.sort()

    m = len(benjhoch)
    k = range(1, len(benjhoch) + 1)
    sortedGenes = [((a[0] * m)/(b), a[1]) for a, b in zip(benjhoch, k)]
    currentMin = sortedGenes[-1][0]
    processedGenesRev = []
    for (FDR, gene) in sortedGenes[::-1]:
        if FDR < currentMin:
            currentMin = FDR
        processedGenesRev.append((currentMin, gene))
    processedGenes = processedGenesRev[::-1]
    return processedGenes

def kSig(ADJUSTED, k):
    return [(i + 1, h, j) for (i, (j, h)) in enumerate(computeBenjhoch(ADJUSTED)[:k])]

In [24]:
def computeP(ADJUSTED):
    p = list(zip(ADJUSTED[5], T_CLUSTER_IDS))
    p.sort()
    return p

## Or give a list of transcription clusters with FDR of 50%

In [25]:
def genesForAlpha(processedGenes, alpha):
    lista = []
    for i, (FDR, ida) in enumerate(processedGenes):
        if FDR < alpha:
            lista.append((FDR, ida))
        else:
            break
    return lista

ALPHA = 0.001
sigGenes = genesForAlpha(computeBenjhoch(QUANT_ADJUSTED), ALPHA)
noSigGenes = len(sigGenes)
print("number of genes at alpha", ALPHA, "is", noSigGenes)
len(sigGenes)

number of genes at alpha 0.001 is 1795


1795

# Work out the mapping from transcription clusters to gene names

In [26]:
probeset_geneid = {}
def unquote(s):
    if len(s) < 2:
        return s
    else:
        if s[0] == '"' and s[-1] == '"':
            return s[1:-1]
    return s
with open("pathway_enrichment/annotated_probesets.csv") as f:
    for i, line in enumerate(f):
        if i != 0:
            line = line.strip().split("\t")
            probeset_geneid[line[1]] = line[0]
trans_gene_id = {}
counter = 0
with open("pathway_enrichment/HuEx-1_0-st-v2.na36.hg19.probeset.csv") as f:
    for i, line in enumerate(f):
        if i < 24:
            pass
        else:
            line = line.rstrip().split(",")
            line = [unquote(i) for i in line]
            probeset_id, transcription_id = line[0], line[6]
            try:
                trans_gene_id[transcription_id] = probeset_geneid[probeset_id]
            except:
                counter += 1

In [27]:
trans_gene_id

{'2315125': 'OR4F5',
 '2315251': 'OR4F29',
 '2315554': 'TTLL10',
 '2315633': 'B3GALT6',
 '2315674': 'SCNN1D',
 '2315739': 'PUSL1',
 '2315773': 'CPTP',
 '2315786': 'TAS1R3',
 '2315880': 'TMEM88B',
 '2315894': 'VWA1',
 '2315918': 'ATAD3C',
 '2315942': 'ATAD3C',
 '2315951': 'ATAD3A',
 '2316218': 'CALML6',
 '2316245': 'PRKCZ',
 '2316343': 'AL590822.2',
 '2316345': 'AL590822.2',
 '2316347': 'AL590822.2',
 '2316350': 'AL590822.2',
 '2316379': 'SKI',
 '2316558': 'RER1',
 '2316605': 'PLCH2',
 '2316746': 'FAM213B',
 '2316905': 'ACTRT2',
 '2316953': 'PRDM16',
 '2317246': 'ARHGEF16',
 '2317317': 'TP73',
 '2317434': 'TPRG1L',
 '2317472': 'CCDC27',
 '2317498': 'SMIM1',
 '2317512': 'DFFB',
 '2317686': 'AJAP1',
 '2318086': 'KCNAB2',
 '2318132': 'KCNAB2',
 '2318157': 'RNF207',
 '2318170': 'RNF207',
 '2318220': 'HES3',
 '2318257': 'ESPN',
 '2318338': 'TAS1R1',
 '2318364': 'ZBTB48',
 '2318398': 'PHF13',
 '2318416': 'THAP3',
 '2318455': 'CAMTA1',
 '2318637': 'VAMP3',
 '2318656': 'PER3',
 '2318736': 'PARK

# Gene list without DSE

In [28]:
len(sigGenes)

1795

In [29]:
sigGeneNoDSE = [(i, trans_gene_id[j]) for (i, j) in sigGenes if j in trans_gene_id]

In [30]:
len(sigGeneNoDSE)

1263

# Disease Specific Effects

## Let us partition k most significant genes and adjust them for DSE to check if it helps improve significance.

In [59]:
import math
def computeDSE(CLEANED, RESIDUALS, genesA, genesB, genesRest):
    #print("hi", CLEANED.shape)
    DSE = [0] * CLEANED.shape[1]
    relevantGenesAIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesA]
    relevantGenesBIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesB]
    relevantGenesRestIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesRest]

    for i in range(CLEANED.shape[1]):
        relevantResiduals = [RESIDUALS[j,i] for j in relevantGenesAIndex]
        DSE[i] = sum(relevantResiduals)/len(relevantResiduals)

    # copy CLEANED into another array
    DSE_ADJUSTED = np.copy(CLEANED)

    # adjust genes in set B with DSEs computed from genes in set A
    #print(len(relevantGenesBIndex + relevantGenesRestIndex))
    for i in relevantGenesBIndex + relevantGenesRestIndex:
        for j in range(CLEANED.shape[1]):
            DSE_ADJUSTED[i][j] -= DSE[j]
    return DSE_ADJUSTED, relevantGenesBIndex

def computeReverseDSE(CLEANED, RESIDUALS, genesA, genesB):
    DSE = [0] * CLEANED.shape[1]
    relevantGenesAIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesA]
    relevantGenesBIndex = [MAP_T_CLUSTER_INDEX[t_cluster_id] for _, t_cluster_id in genesB]

    for i in range(CLEANED.shape[1]):
        relevantResiduals = [RESIDUALS[j,i] for j in relevantGenesAIndex]
        DSE[i] = sum(relevantResiduals)/len(relevantResiduals)

    # copy CLEANED into another array
    DSE_ADJUSTED = np.copy(CLEANED)

    # adjust genes in set B with DSEs computed from genes in set A
    for i in relevantGenesBIndex:
        for j in range(CLEANED.shape[1]):
            DSE_ADJUSTED[i][j] += DSE[j]
    return DSE_ADJUSTED, relevantGenesBIndex

def fractionImprovedPValues(GENE_PVALUES, GENE_PVALUES_B, relevantGenesBIndex):
    previousPValues = [GENE_PVALUES[i] for i in relevantGenesBIndex]
    currentPValues = [GENE_PVALUES_B[i] for i in relevantGenesBIndex]
    countSmaller = 0
    #print(previousPValues, currentPValues)
    for i, _ in enumerate(relevantGenesBIndex):
        if (currentPValues[i] < previousPValues[i]):
            countSmaller += 1
    return countSmaller/len(currentPValues)

def howMuchImprovedPValues(GENE_PVALUES, GENE_PVALUES_B, relevantGenesBIndex):
    previousPValues = [GENE_PVALUES[i] for i in relevantGenesBIndex]
    currentPValues = [GENE_PVALUES_B[i] for i in relevantGenesBIndex]
    countSmaller = 0
    improvements = []
    worsens = []
    for i, _ in enumerate(relevantGenesBIndex):
        if previousPValues[i] > currentPValues[i]:
            improvement = previousPValues[i] / currentPValues[i]
            improvements.append(improvement)
        else:
            worsen = currentPValues[i] / previousPValues[i]
            worsens.append(worsen)
    return sum(improvements)/len(improvements), sum(worsens)/len(improvements)
    

def check_DSE(x, DATASET, reverse = False):
    compute = computeDSE
    if reverse:
        compute = computeReverseDSE
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    genesA = genes[:x//2]
    genesB = genes[x//2:x]
    genesRest = genes[x:]
    DSE_ADJUSTED, relevantGenesBIndex = compute(DATASET[0], DATASET[1], genesA, genesB, genesRest)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    fractionImproved = fractionImprovedPValues(DATASET[5], ADJUSTED_DATASET[5], relevantGenesBIndex)
    averageImprovement = howMuchImprovedPValues(DATASET[5], ADJUSTED_DATASET[5], relevantGenesBIndex)
    print("average improvement is {}".format(averageImprovement))
    return fractionImproved

In [66]:
def give_adjusted_dataset(x, DATASET):
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    print(genes[:10])
    genesA = genes[:x//2]
    genesB = genes[x//2:x]
    genesRest = genes[x:]
    DSE_ADJUSTED, relevantGenesBIndex = computeDSE(DATASET[0], DATASET[1], genesA, genesB, genesRest)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    return ADJUSTED_DATASET, relevantGenesBIndex

In [67]:
def give_sig_genes(x, DATASET):
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    genesA = genes[:x//2]
    genesB = genes[x//2:x]
    genesRest = genes[x:]
    #sig_Genes1 = genesForAlpha(computeBenjhoch(DATASET), ALPHA)
    DSE_ADJUSTED, relevantGenesBIndex = computeDSE(DATASET[0], DATASET[1], genesA, genesB, genesRest)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    sigGenes2 = genesForAlpha(computeBenjhoch(ADJUSTED_DATASET), ALPHA)
    noSigGenes = len(sigGenes2)
    print("number of genes at alpha", ALPHA, "is", noSigGenes)
    return [(i, trans_gene_id[j]) for (i, j) in sigGenes2 if j in trans_gene_id]

In [107]:
check_DSE(256, QUANT_ADJUSTED)

average improvement is (4.6687560491590965, 7.2736182581731974)


0.671875

In [69]:
sigGeneDSE = give_sig_genes(256, QUANT_ADJUSTED)

number of genes at alpha 0.001 is 1650


In [71]:
print(len(sigGeneDSE))

1122


In [72]:
print(len(sigGeneNoDSE))

1263


# Inclusions

In [40]:
noDSE = {k[1] for k in sigGeneNoDSE}

In [41]:
DSE = {k[1] for k in sigGeneDSE}

# Counts before and after ISE

In [76]:
DSE_ADJUSTED, genes_B_index = give_adjusted_dataset(256, QUANT_ADJUSTED)

[(2.6385684330044863e-17, '2485112'), (3.20364407481363e-16, '2775909'), (3.2431180641952277e-16, '3342426'), (1.1349508251687626e-14, '3033397'), (1.2893396941173192e-14, '3333309'), (1.5653114639306751e-14, '3978518'), (1.5735177747311401e-14, '3345157'), (1.7418412047949126e-14, '3113202'), (5.8557532625833504e-14, '2948648'), (1.1627467317302875e-13, '3528013')]


In [83]:
# sss schema
#(CLEANED, RESIDUALS, RESIDUALS_PVALUES, MODAL_ALLELE, ISEs, GENE_PVALUES)

In [103]:
DSE_good_i = []
for i, p in enumerate(DSE_ADJUSTED[5]):
    if p < 0.000000183:
        DSE_good_i.append(i)

In [105]:
QUANT_good_i = []
for i, p in enumerate(QUANT_ADJUSTED[5]):
    if p < 0.000000183:
        QUANT_good_i.append(i)

In [104]:
len([DSE_good_i)

500

In [106]:
len(QUANT_good_i)

500

In [111]:
len(set(DSE_good_i).intersection(set(QUANT_good_i)))

441

In [115]:
trans_gene_id

{'2315125': 'OR4F5',
 '2315251': 'OR4F29',
 '2315554': 'TTLL10',
 '2315633': 'B3GALT6',
 '2315674': 'SCNN1D',
 '2315739': 'PUSL1',
 '2315773': 'CPTP',
 '2315786': 'TAS1R3',
 '2315880': 'TMEM88B',
 '2315894': 'VWA1',
 '2315918': 'ATAD3C',
 '2315942': 'ATAD3C',
 '2315951': 'ATAD3A',
 '2316218': 'CALML6',
 '2316245': 'PRKCZ',
 '2316343': 'AL590822.2',
 '2316345': 'AL590822.2',
 '2316347': 'AL590822.2',
 '2316350': 'AL590822.2',
 '2316379': 'SKI',
 '2316558': 'RER1',
 '2316605': 'PLCH2',
 '2316746': 'FAM213B',
 '2316905': 'ACTRT2',
 '2316953': 'PRDM16',
 '2317246': 'ARHGEF16',
 '2317317': 'TP73',
 '2317434': 'TPRG1L',
 '2317472': 'CCDC27',
 '2317498': 'SMIM1',
 '2317512': 'DFFB',
 '2317686': 'AJAP1',
 '2318086': 'KCNAB2',
 '2318132': 'KCNAB2',
 '2318157': 'RNF207',
 '2318170': 'RNF207',
 '2318220': 'HES3',
 '2318257': 'ESPN',
 '2318338': 'TAS1R1',
 '2318364': 'ZBTB48',
 '2318398': 'PHF13',
 '2318416': 'THAP3',
 '2318455': 'CAMTA1',
 '2318637': 'VAMP3',
 '2318656': 'PER3',
 '2318736': 'PARK

In [117]:
DSE_translated_result = []
for i in DSE_good_i:
    gene_id = T_CLUSTER_IDS[i]
    if gene_id in trans_gene_id:
        DSE_translated_result.append(trans_gene_id[gene_id])

In [118]:
QUANT_translated_result = []
for i in QUANT_good_i:
    gene_id = T_CLUSTER_IDS[i]
    if gene_id in trans_gene_id:
        QUANT_translated_result.append(trans_gene_id[gene_id])

In [119]:
len(QUANT_translated_result)

339

In [122]:
for i in QUANT_translated_result:
    print(i)

PRKCZ
SKI
RBP7
UBE4B
AGTRAP
MDS2
HS2ST1
SLC50A1
BGLAP
TOR3A
LGR6
SOX13
IARS2
HLX
ARF1
CHRM3
ENO1
PHC2
PSMB2
COL9A2
TTC22
PSMA5
SF3B4
POGZ
GATAD2B
MEF2D
PLXNA2
SLC30A1
BPNT1
CRIPT
OTX1
PLEK
ANXA4
CCT7
MTHFD2
CNNM3
PELI1
DUSP11
SAP130
GALNT3
STK17B
NMUR1
MYD88
VIPR1
RBM5
DNAH1
PRKCD
GRAMD1C
SLC15A2
NUDT16
AC022498.1
LPP
SEC13
SEC22C
FOXP1
HCLS1
KIAA1257
TMCC1
ZMAT3
KLHL6
ATP13A3
PPP1R2
KIT
AREG
PDLIM5
SAP30
RELL1
TLR10
PLAC8
HPSE
SLC39A8
TIFA
ANXA5
PDGFC
TRIP13
MTRR
FAM151B
JADE2
NDST1
ERGIC1
CLPTM1L
FAM173B
ATOX1
TIMD4
CCNJL
RNF44
WRNIP1
HIST1H4I
ZNRD1
RUNX2
NEDD9
HIST1H3B
HIST1H2BB
HIST1H2BG
HIST1H4L
NEU1
MED20
PLA2G7
GABRR2
UBE2J1
ZDHHC4
ZMIZ2
ARPC1B
TTC26
INSIG1
RBM33
ELMO1
VOPP1
RFC2
SRI
GIGYF1
HIPK2
PROSC
HGSNAT
SDC2
NOV
ZHX2
TG
PPP2CB
BRF2
ATP6V1H
RIDA
CD274
KDM4C
DNAJB5
CLTA
FOXD4L3
TGFBR1
INVS
MRPL41
PLGRKT
ABHD17B
ZCCHC6
IPPK
TNFSF8
MEGF9
PSMD5
STOM
PSMB7
EDF1
FBXO18
CELF2
ECHDC3
MCU
CHCHD1
HHEX
ABCC2
HPS6
GSTO1
ACSL5
ACBD5
MICU1
P4HA1
ACTA2
PIK3AP1
COX15
ACTR1A
IFITM1
SMPD1
IL

In [120]:
len(DSE_translated_result)

310

In [123]:
for i in DSE_translated_result:
    print(i)

PRKCZ
SKI
RBP7
UBE4B
AGTRAP
HS2ST1
SLC50A1
TOR3A
LGR6
IARS2
HLX
CHRM3
ENO1
TMEM234
PHC2
PSMB2
SF3B4
POGZ
GATAD2B
MEF2D
PLXNA2
SLC30A1
BPNT1
CRIPT
OTX1
PLEK
ANXA4
MTHFD2
CNNM3
PELI1
DUSP11
SAP130
GALNT3
STK17B
NMUR1
MYD88
VIPR1
RBM5
DNAH1
PRKCD
GRAMD1C
SLC15A2
NUDT16
AC022498.1
LPP
SEC13
SEC22C
TWF2
RFT1
FOXP1
HCLS1
KIAA1257
TMCC1
ZMAT3
KLHL6
ATP13A3
PPP1R2
KIT
AREG
PDLIM5
SAP30
RELL1
TLR10
PLAC8
HPSE
TIFA
ANXA5
PDGFC
MTRR
LRRC70
FAM151B
JADE2
NDST1
CNOT8
ERGIC1
CLPTM1L
FAM173B
ATOX1
ADAM19
CCNJL
RNF44
LMAN2
HIST1H4I
LY6G5B
RUNX2
NEDD9
HIST1H3B
HIST1H2BG
HIST1H4L
CLIC1
NEU1
MED20
PLA2G7
GABRR2
UBE2J1
ZMIZ2
ARPC1B
TTC26
INSIG1
RBM33
ELMO1
VOPP1
SRI
GIGYF1
HIPK2
PROSC
HGSNAT
NOV
ZHX2
TG
DENND3
PPP2CB
BRF2
ATP6V1H
CD274
KDM4C
DNAJB5
CLTA
FOXD4L3
TGFBR1
INVS
MRPL41
PLGRKT
ZCCHC6
IPPK
TNFSF8
MEGF9
PSMD5
STOM
CELF2
ECHDC3
SEC61A2
MCU
CHCHD1
HHEX
ABCC2
GSTO1
ACSL5
ACBD5
MICU1
P4HA1
ACTA2
PIK3AP1
COX15
ACTR1A
IFITM1
SMPD1
ILK
ADM
AMPD3
PDE3B
CD44
CD82
OR4D10
BEST1
POLR2G
FERMT3
PPFIA1
ACER3
DDI

In [98]:
def geneNamesatAlpha(experiment, alpha):
    genes = genesForAlpha(computeBenjhoch(experiment), alpha)
    return [(i, trans_gene_id[j]) for (i, j) in genes if j in trans_gene_id]

In [99]:
def debugGeneNamesatAlpha(experiment, alpha):
    genes = genesForAlpha(computeBenjhoch(experiment), alpha)
    return genes

In [106]:
thresholds = [0.000003, 0.001, 0.01]

In [120]:
fractionImproved = fractionImprovedPValues(QUANT_ADJUSTED[5], DSE_ADJUSTED[5], genes_B_index)

In [121]:
fractionImproved

0.671875

In [107]:
v = []
for alpha in thresholds:
    v.append(geneNamesatAlpha(QUANT_ADJUSTED, alpha))

In [102]:
vv = []
for alpha in thresholds:
    vv.append(debugGeneNamesatAlpha(QUANT_ADJUSTED, alpha))

In [108]:
[len(i) for i in v]

[256, 1263, 2562]

In [109]:
w = []
for alpha in thresholds:
    w.append(geneNamesatAlpha(DSE_ADJUSTED, alpha))

In [103]:
ww = []
for alpha in thresholds:
    ww.append(geneNamesatAlpha(DSE_ADJUSTED, alpha))

In [65]:
ww

NameError: name 'ww' is not defined

In [110]:
[len(i) for i in w]

[244, 1122, 2170]

In [194]:
t = 0.00000020

In [195]:
len(sorted([i for i in DSE_ADJUSTED[5] if i <= t]))

511

In [196]:
len(sorted([i for i in QUANT_ADJUSTED[5] if i <= t]))

510

In [36]:
noDSE.difference(DSE)

set()

In [37]:
DSE.difference(noDSE)

set()

In [126]:
import random
def check_DSE_random(x, DATASET):
    genes = list(zip(DATASET[5], T_CLUSTER_IDS))
    genes.sort()
    randomFudge = genes[:]
    random.shuffle(randomFudge)
    genesA = randomFudge[:x//2]
    genesB = genes[x//2:x]
    DSE_ADJUSTED, relevantGenesBIndex = computeDSE(DATASET[0], DATASET[1], genesA, genesB)
    ADJUSTED_DATASET = numpyAnalysis(DSE_ADJUSTED)
    fractionImproved = fractionImprovedPValues(DATASET[5], ADJUSTED_DATASET[5], relevantGenesBIndex)
    return fractionImproved

In [127]:
experiments = [2**i for i in range(5, 15)] + [len(QUANT_ADJUSTED[5])]
experiments

[32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 22011]

In [125]:
results = {}
for i in experiments:
    results[i] = check_DSE(i, QUANT_ADJUSTED)

average improvement is (39.451256901845923, 109.97416696884891)
average improvement is (10.151984108019013, 61.479712269376186)
average improvement is (4.739087470252362, 15.67288576662455)
average improvement is (4.6687560491590965, 7.2736182581731974)
average improvement is (4.7796659715121548, 7.7489219751461906)
average improvement is (5.8122763709881866, 5.3261234284290673)
average improvement is (2.7531586860884492, 4.1384224952246953)
average improvement is (1.4259366604012378, 3.1045187674594179)
average improvement is (1.1569001292593364, 2.2154553350053718)
average improvement is (1.0274075422901201, 2.042124159202177)
average improvement is (1.0077232030375038, 2.0125679268555254)


In [None]:
results

In [None]:
import time
start = time.time()
resultsRandom = {}
testRange = experiments
for i in testRange:
    resultsRandom[i] = []
for j in range(500):
    for i in testRange:
        res = check_DSE_random(i, QUANT_ADJUSTED)
        resultsRandom[i].append(res)
stop = time.time()

In [None]:
print(stop - start)

In [None]:
resultsRandom

In [None]:
stop - start

# For ALL check which batch the data comes from:

In [None]:
index_to_batch = []
if CEL_LOCATION.split("/")[-1] == "ALL":
    for file in CELFILES:
        index_to_batch.append(file.split(".")[0].split("_")[-1][0])

# Write results to a file

In [None]:
import json
with open(CEL_LOCATION + "/results", "w") as f:
    json.dump({
        "QA": computeBenjhoch(QUANT_ADJUSTED),
        "QA_raw": [list(key) for key in np.transpose(QUANT_ADJUSTED[0])],
        "MODAL_ALLELE": MODAL_ALLELE,
        "GENDER" : SEX,
        "labels": index_to_batch,
        "QA_P" : computeP(QUANT_ADJUSTED),
        "NON_ADJUSTED": computeBenjhoch(NON_ADJUSTED),
        "NON_ADJUSTED_P": computeP(NON_ADJUSTED),
        "MEAN_ADJUSTED": computeBenjhoch(MEAN_ADJUSTED),
        "MEAN_ADJUSTED_P": computeP(MEAN_ADJUSTED),
        "results" : results,
        "resultsRandom": resultsRandom
    }, f)