In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import scipy
import scipy.stats
import math

In [2]:
class MedianPolish:
    """Fits an additive model using Tukey's median polish algorithm"""

    #note that the self argument is default that allows reference to an instance of the
    #class that has been defined by the user
    #no argument is actally specified to be passed to the self argument by the user;
    #the first user specified argument
    #goes into the first argument listed after self in the code of the class
    def __init__(self, array): 
        """Get numeric data from numpy ndarray to self.tbl, keep the original copy in tbl_org"""
        #checks if the argument called array is actually a numerical array of the
        #object type np.ndarray by using the comparison function isinstance
        if isinstance(array, np.ndarray):
            self.tbl_org = array
            #tbl_org will allow you to compare the final polished matrix with row and column
            #effects removed to the original unpolished matrix
            self.tbl = self.tbl_org.copy()
        else:
            raise TypeError('Expected the argument to be a numpy.ndarray.')

    @staticmethod
    def csv_to_ndarray(fname): 
        """ Utility method for loading ndarray from .csv file""" 
        try:
            #generates an array-like object of type np.ndarray
            #from a comma separated values file
            return np.genfromtxt(fname, delimiter=",")	
        except Exception, e:
            print "Error loading file %s:" % fname
            raise

    def median_polish(self, max_iterations, method):
        """
            Implements Tukey's median polish alghoritm for additive models
            method - default is median, alternative is mean. That would give us result equal ANOVA.
        """
        
        grand_effect = 0
        median_row_effects = 0
        median_col_effects = 0
        #defines a vector that stores the row_effects during each iteration;
        #the vector is initialized with zeros and length equal to the number of rows
        #as determined by the function shape[0]
        row_effects = np.zeros(shape=self.tbl.shape[0])
        col_effects = np.zeros(shape=self.tbl.shape[1])

        for i in range(max_iterations):
            if method == 'median':
                #note that np.median's second arg specifies along which axis to perform
                #the median calculation, with axis = 1 being rows, and axis = 0 being columns
                row_medians = np.median(self.tbl, 1)
                #the next line keeps a running total of the row effects that have been
                #subtracted out during the iterative polishing procedure
                row_effects += row_medians
                median_row_effects = np.median(row_effects)
            elif method == 'average':
                row_medians = np.average(self.tbl, 1) 
                row_effects += row_medians
                median_row_effects = np.average(row_effects)
            
            #not sure what the grand_effect variable is keeping track of; it is
            #returned but never used in later calculations
            grand_effect += median_row_effects
            
            #the following line I have commented out, don't know why it is needed
            #row_effects -= median_row_effects
            
            #the following line reshapes the the row_medians array into a column
            #this reshaping is necessary because each element in row_medians
            #is the median from each row, which needs to be subtracted from each row
            #by reshaping into a column, can simply subtract this column from each
            #column of the data matrix, self.tbl
            #the np.newaxis function adds a new dimension, such that now the array
            #is a matrix with n number of rows and 1 column; hence the array is now a
            #column vector.
            self.tbl -= row_medians[:, np.newaxis]

            if method == 'median':
                col_medians = np.median(self.tbl, 0) 
                col_effects += col_medians
                median_col_effects = np.median(col_effects)
            elif method == 'average':
                col_medians = np.average(self.tbl, 0) 
                col_effects += col_medians
                median_col_effects = np.average(col_effects)

            #note that by default, arrays such as col_medians are formulated as a row vector
            #for use in numerical calculations
            self.tbl -= col_medians
            
            #the following line I have commented out, don't know why it is needed
            #col_effects -= col_medians
            
            grand_effect += median_col_effects

        return grand_effect, col_effects, row_effects , self.tbl, self.tbl_org

In [3]:
#perform missing data imputation on a screen-by-screen basis,
#where a screen is considered to be all screens done in the same day/batch
#of cells

"""KNN (k nearest neighbors) missing data imputation using Euclidean distance to select k
nearest neighbors and using weight averages for the estimation of the missing value.
Missing values are imputed starting with the first column and going down each row in the
first column, then repeating with the next column to the right in the data table."""
#chose 10 nearest neighbors because in a given screen not likely has more than 10 or so hits
#and so only about 10 or so neighboring hits would be informative for imputing a putative
#hit that is missing a value.  for putative non-hits, there are many neighbors with similar
#values and so 10 is still a good number
K_NEAREST_NEIGHBORS = 10

NUM_REPL = 2

data_file = open('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/before.csv', 'rU')
data_table = np.genfromtxt(data_file, delimiter=',')

#the actual data is listed in the following rows and columns; the rest are
#data table label strings
FIRSTROW = 4
LASTROW = 388
FIRSTCOLUMN = 1
LASTCOLUMN = 15

data_table = data_table[FIRSTROW:LASTROW, FIRSTCOLUMN:LASTCOLUMN]

original_numrows = data_table.shape[0]
original_numcols = data_table.shape[1]

data_table = np.reshape(np.ravel(data_table, order = 'F'), (original_numrows * original_numcols / NUM_REPL, NUM_REPL), order = 'F')

numrows = data_table.shape[0]
numcols = data_table.shape[1]

for j in range(numcols):
    for i in range(numrows):
        #find missing values by checking for 'nan' value in ndarray
        if np.isnan(data_table[i][j]):
            #for determining Euclidean distances from the small molecule compound which is
            #missing value in 1 replicate to all the other vectors containing data for
            #all the other small molecule compounds,
            #we ignore the values in the replicate in which there is the missing value
            other_vectors = np.delete(data_table, j, axis = 1)
            
            #initialize the small molecule compound for which
            #there is a missing value for 1 replicate; this vector
            #will be used as the basis to compare Euclidean distance to the other_vectors
            vector_missingval = other_vectors[i:(i + 1), 0:numcols]
            
            #initialize a vector containing the data in the same repl in which there is
            #the missing value; the missing value will be imputed by a weighted average
            #of the other data in this replicate
            estimating_vals = data_table[0:numrows, j:(j + 1)]
            
            other_vectors = np.delete(other_vectors, i, axis = 0)
            estimating_vals = np.delete(estimating_vals, i, axis = 0)
            
            nan_estimator_flag = 1
            
            #remove other rows in which the estimating value is also missing
            while(nan_estimator_flag == 1):
                nan_estimator_flag = 0
                for x in range(len(estimating_vals)):
                    if (np.isnan(estimating_vals[x])):
                        nan_estimator_flag = 1
                        row_with_nan = x
                
                if(nan_estimator_flag == 1):
                    other_vectors = np.delete(other_vectors, row_with_nan, axis = 0)
                    estimating_vals = np.delete(estimating_vals, row_with_nan, axis = 0)
            
            #the following vector will store the Euclidean distances
            euclid_dist = np.zeros((other_vectors.shape[0], 1))

            for k in range(other_vectors.shape[0]):
                flag_nanvector = 0  
                
                for l in range(other_vectors.shape[1]):
                    if (np.isnan(other_vectors[k][l]) == False) and (np.isnan(vector_missingval[0][l]) == False):
                        #calculate Euclidean distance as defined in Troyanskaya et al (2001)
                        euclid_dist[k][0] += (other_vectors[k][l] - vector_missingval[0][l]) ** 2
                        
                        flag_nanvector = 1
                
                if flag_nanvector == 0:
                    #nans are always ranked last
                    euclid_dist[k][0] = data_table[i][j]
            
            #the similarity score calculated below and as defined in Troyanskaya et al (2001)
            #will serve as the weights for imputing the missing value based on weighted
            #average of the remaining - observed - data in the replicate missing the value
            similarity = 1 / euclid_dist
            
            #rank the other small molecule compounds in terms of their Euclidean distance
            #from the small molecule compound with the missing value.  select the k nearest
            #neighbors in terms of Euclidean distance
            simil_rank = scipy.stats.rankdata(euclid_dist, method = "ordinal")
            
            sum_simil = 0
            weighted_avg = 0
            for m in range(other_vectors.shape[0]):
                if simil_rank[m] <= K_NEAREST_NEIGHBORS:
                    sum_simil += similarity[m]
                    weighted_avg += similarity[m] * estimating_vals[m]
            data_table[i][j] = weighted_avg/sum_simil

#reform the original data table, now with the missing values filled in
data_table = np.reshape(np.ravel(data_table, order = 'F'), (original_numrows, original_numcols), order = 'F')

print(data_table)

#with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/missingval filled.csv','w') as filled_outputfile:
#    writer = csv.writer(filled_outputfile, lineterminator = '\n')
#    writer.writerows(data_table)

[[  1.14814446e-01   4.11857158e-01   1.29479332e-01 ...,   1.77640104e-01
    1.95734697e-01   1.79616005e-01]
 [  9.56101080e-02   3.61141691e-01   1.91803373e-01 ...,   1.80564764e-01
    1.71517724e-01   1.65645407e-01]
 [  7.48222030e-02   3.05343568e-01   1.73314003e-01 ...,   1.68256609e-01
    1.68432365e-01   1.66423707e-01]
 ..., 
 [  7.33425360e-02   3.70047657e+02   2.14538968e-01 ...,   1.95319587e-01
    2.26943845e-01   2.17638225e-01]
 [  8.43901350e-02   2.05667344e-01   1.97414161e-01 ...,   2.01141672e-01
    2.16028112e-01   1.94816804e-01]
 [  8.69914080e-02   1.82278028e+02   1.97877942e-01 ...,   1.82997519e-01
    1.97496004e-01   1.86764418e-01]]


In [4]:
#perform polish and calculate b scores on a plate-by-plate basis
#where each plate's data is listed as a separate column in the original csv

if __name__ == "__main__":

    #number of iterations to run Tukey's two median polish
    NUM_POLISH_ITER = 10
    
    #number of plates, rows, columns
    PLATES = 14
    PLATE_ROWS = 16
    PLATE_COLS = 24
    
    #makes output more legible for debugging
    np.set_printoptions(precision = 5, suppress = True)

    num_rows = data_table.shape[0]
    num_cols = data_table.shape[1]
    
    #the data table needs to be reshaped into plates with rows and columns
    #this way I can run the polish on each of the plates.  the next few lines
    #accomplish this reshaping
    #note that by entering a 1-tuple for size, reshape will make a 1D array; no
    #need to actually specify that the array has 1 row or 1 col, as functionally
    #np will treat that as a 2D array with 1 row or 1 col
    data_table = np.reshape(data_table, (num_rows * num_cols), order = 'F')

    #reshape into data organized by plate, row, col
    data_table = np.reshape(data_table, (PLATES, PLATE_ROWS, PLATE_COLS), order = 'C')

    bscores = []
    bscores_printed = np.zeros((PLATE_ROWS * PLATE_COLS, PLATES))
    resid_printed = np.zeros((PLATE_ROWS * PLATE_COLS, PLATES))
    
    #perform polish and calculate b scores on a plate-by-plate basis
    for i in xrange(0, PLATES):
    
        #iterate over each plate and perform the median polish
        arr = data_table[i, :, :]

        tbl_avg = np.average(arr)
        #subtract out the average for each plate, thus normalizing out plate effects
        arr -= tbl_avg
        #pass each plate's data matrix into the MedianPolish object
        mp = MedianPolish(arr)

        #first argument indicates number of iterations to be run
        #ce is an ndarray storing the column effects after n iterations of polishing
        #re is an ndarray storing the row effects after n iterations of polishing
        #resid is the data table that has been polished to remove 
        ge, ce, re, resid, tbl_org =  mp.median_polish(NUM_POLISH_ITER, "median") 

        re_reshape = re[:, np.newaxis]
        
        #the tbl_org returned by mmp.median_polish has had the tbl_avg subtracted,
        #so to get the initial data table back need to add the tbl_avg back
        tbl_org += tbl_avg

        #the next few lines compute the median absolute deviation
        #MAD = median(|x - median(x)|)
        tbl_resid_minusmedians = resid - np.median(resid)
        median_absdev = np.median(np.absolute(tbl_resid_minusmedians))
        
        #find the b scores of the plate
        tbl_bscore = resid / median_absdev
        
        tbl_bscore_toprint = np.reshape(np.ravel(tbl_bscore, order = 'C'), (PLATE_ROWS * PLATE_COLS), order = 'F')
        bscores_printed[:, i] = tbl_bscore_toprint
        
        resid_toprint = np.reshape(np.ravel(resid, order = 'C'), (PLATE_ROWS * PLATE_COLS), order = 'F')
        resid_printed[:, i] = resid_toprint
        
        #convert tbl_bscore from ndarray to simple list
        tbl_bscore = tbl_bscore.tolist()
        
        #collect the b scores of each plate into the array bscores
        bscores.append(tbl_bscore)
    
    bscores = np.asarray(bscores)
    
    print(bscores_printed)
    
#    with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/b scores.csv','w') as bscores_outputfile:
#        writer = csv.writer(bscores_outputfile, lineterminator = '\n')
#        writer.writerows(bscores_printed)
        
#    with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/after.csv','w') as afterpolish_outputfile:
#        writer = csv.writer(afterpolish_outputfile, lineterminator = '\n')
#        writer.writerows(resid_printed)


[[     2.04251    -14.37276     -4.10163 ...,      2.4048       4.35924
       4.03549]
 [    -2.93128     -0.54506      1.20778 ...,      1.84737      1.2637
       0.48307]
 [    -5.17089     -0.47661     -0.4671  ...,      0.85046     -0.00563
      -0.00873]
 ..., 
 [    -2.80828  13543.49988     -0.06561 ...,     -1.49032      0.23867
      -0.86566]
 [    -0.94212    -14.03389     -2.33834 ...,     -1.74661     -1.15266
      -3.83698]
 [     4.31385   6661.94559      1.04225 ...,      1.17598      1.33829
       0.92417]]


In [5]:
#select hits based on median and median absolute deviation on a screen-by-screen
#basis, where a screen is considered to be all screens done in the same day/batch
#of cells
    
#1.4826 is scaling constant to make 1 MAD comparable in magnitude to 1 SD
#see (chung, strulovici et al 2007)
MAD_SCALING_CONST = 1.4826
INDEX_OFFSET = 1
#2 MAD corresponds to false positive rate of 0.023 under a normal distribution
THRESHOLD = 2
    
bscores_median = np.median(bscores)

bscores_copy = bscores
#subtract the overall median; this is later used to calculate the MAD
for i in range(0, PLATES):
    for j in range(0, PLATE_ROWS):
        for k in range(0, PLATE_COLS):
            bscores_copy[i][j][k] -= bscores_median

mad = MAD_SCALING_CONST * np.median(np.absolute(bscores_copy))
    
upper_threshold = bscores_median + THRESHOLD * mad
lower_threshold = bscores_median - THRESHOLD * mad

hits_mad = []
sg_enhancers_mad = []
    
unique_plates = PLATES / NUM_REPL

for i in range(0, unique_plates):
    for j in range(0, PLATE_ROWS):
        for k in range(0, PLATE_COLS):
            #find median of replicates, then check if it is +/- 3 MAD
            median_ofrepl = np.median([bscores[i][j][k], bscores[i + unique_plates][j][k]])
            if median_ofrepl < lower_threshold:
                #collect plate, row, column coordinates of hits
                #indexed from 1 rather than 0
                hits_mad.append(i + INDEX_OFFSET)
                hits_mad.append(j + INDEX_OFFSET)
                hits_mad.append(k + INDEX_OFFSET)
            if median_ofrepl > upper_threshold:
                sg_enhancers_mad.append(i + INDEX_OFFSET)
                sg_enhancers_mad.append(j + INDEX_OFFSET)
                sg_enhancers_mad.append(k + INDEX_OFFSET)


len_hits_mad = len(hits_mad)
hits_mad = np.asarray(hits_mad)

#reshape into ordered triples giving the plate, row, and column coordinates for hits
hits_mad = np.reshape(hits_mad, (len_hits_mad / 3, 3), order = 'C')
print(hits_mad)

len_sg_enhancers_mad = len(sg_enhancers_mad)
sg_enhancers_mad = np.asarray(sg_enhancers_mad)
sg_enhancers_mad = np.reshape(sg_enhancers_mad, (len_sg_enhancers_mad / 3, 3), order = 'C')
print(sg_enhancers_mad)
    
#with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/hits mad.csv','w') as hits_mad_outputfile:
    #writer = csv.writer(hits_mad_outputfile, lineterminator = '\n')
    #writer.writerows(hits_mad)
        
#with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/sg enhancers mad.csv','w') as sgenh_mad_outputfile:
    #writer = csv.writer(sgenh_mad_outputfile, lineterminator = '\n')
    #writer.writerows(sg_enhancers_mad)


[[ 1  1  3]
 [ 1  1 18]
 [ 1  2 17]
 [ 1  3  1]
 [ 1  3  9]
 [ 1  3 10]
 [ 1  4  5]
 [ 1  5 13]
 [ 1  5 15]
 [ 1  5 23]
 [ 1  6 20]
 [ 1  6 23]
 [ 1  7 10]
 [ 1  7 13]
 [ 1  7 14]
 [ 1  8 22]
 [ 1  9  2]
 [ 1 10  7]
 [ 1 10 11]
 [ 1 10 12]
 [ 1 10 16]
 [ 1 11  1]
 [ 1 11 12]
 [ 1 11 21]
 [ 1 12  5]
 [ 1 13  6]
 [ 1 13 11]
 [ 1 14  6]
 [ 1 14 18]
 [ 1 15  3]
 [ 1 15  4]
 [ 1 15  9]
 [ 1 15 13]
 [ 1 16  9]
 [ 1 16 22]
 [ 2  1  1]
 [ 2  2  1]
 [ 2  3  1]
 [ 2  4  1]
 [ 2  4 13]
 [ 2  4 24]
 [ 2  5  1]
 [ 2  5 10]
 [ 2  6  1]
 [ 2  6  5]
 [ 2  7  1]
 [ 2  7 10]
 [ 2  7 17]
 [ 2  9  7]
 [ 2  9 12]
 [ 2 11  5]
 [ 2 12  5]
 [ 2 14  4]
 [ 2 14  7]
 [ 2 14  8]
 [ 2 15  5]
 [ 2 15  7]
 [ 2 15  9]
 [ 2 15 10]
 [ 2 16  6]
 [ 2 16  7]
 [ 2 16  8]
 [ 2 16 17]
 [ 2 16 20]
 [ 2 16 23]
 [ 3  1 15]
 [ 3  2 21]
 [ 3  2 22]
 [ 3  3 18]
 [ 3  4 17]
 [ 3  4 21]
 [ 3  5  3]
 [ 3  5 18]
 [ 3  5 22]
 [ 3  6  7]
 [ 3  6  9]
 [ 3  6 10]
 [ 3  6 16]
 [ 3  7  1]
 [ 3  8  1]
 [ 3  9  1]
 [ 3  9  8]
 [ 3  9 18]
 [ 3

In [6]:
#fit variances to random variance model (inverse gamma distributed) on a screen-by-screen
#basis, where a screen is considered to be all screens done in the same day/batch
#of cells

#number of experimental and control groups
NUM_GROUPS = 1

variances = []
averages = []

for i in range(0, unique_plates):
    for j in range (0, PLATE_ROWS):
        for k in range (0, PLATE_COLS):
            first_repl = bscores[i][j][k]
            second_repl = bscores[i + unique_plates][j][k]
            sample_avg = (first_repl + second_repl) / NUM_REPL
            sample_variance = ((first_repl - sample_avg) ** 2 + (second_repl - sample_avg) ** 2) / (NUM_REPL - 1)
            variances.append(sample_variance)
            averages.append(sample_avg)

#according to paper wright and simon 2003, the sample variances multipled by two constants
#a and b follow an F distribution with parameters (n-k) and 2a, where n is number of
#replicates, k is number of group (experimental, control, etc)
#in my data, I have duplicates and 1 group, so n = 2, k = 1.
param = scipy.stats.f.fit(variances, f0 = NUM_REPL - NUM_GROUPS)
print(param)

#after fitting we want to find the value of a and b, since these are the parameters for
#the putative inverse gamma distribution that is the true distribution of the variances
#of the small molecule screen.  finding a and b will help us specify the inverse
#gamma distribution, which will improve the power of our t tests (wright and simon 2003)
#find parameter a: since the fitted distribution has parameters (n-k) and 2a, we can
#take the second parameter and divide by 2 to get a
invgammaparam_a = param[1] / 2

#we fit an F distribution to our variances, and we see that the scaling s is stored in the
#fourth parameter.  a*b*variances fits to an F distribution with area under the curve = 1
#since F is a probability distribution (scaling = 1)
#thus when we simply fit our variances to an F distribution,
#we may get a scaling s =/= 1 (area under the curve not equal 1)
#since multiplying variates by a constant changes the scaling of the fitted F distribution
#we can figure out what a*b is by knowing that multiplying the variances
#by a*b brings the scaling up to 1; hence a*b equals the multiplicative inverse
#of the current scaling.  from here we can find b because we already have a
invgammaparam_b = (1 / param[3]) / invgammaparam_a
print(invgammaparam_a)
print(invgammaparam_b)

print(variances[0])
new_variances = [(invgammaparam_a * invgammaparam_b * x) for x in variances]
print(new_variances[0])
param = scipy.stats.f.fit(new_variances, f0 = NUM_REPL - NUM_GROUPS)
print(param)
print(variances[0])

(1, 1.4098648716778177, 3.3054561917160277e-08, 1.34976083793044)
0.704932435839
1.05098297908
0.00380558484244
0.00281945122091
(1, 1.4334532473486337, 2.4489199114594365e-08, 1.0257152190298187)
0.00380558484244


  Lhat = muhat - Shat*mu


In [7]:
#select hits based on t tests (under random variance model), on a screen-by-screen
#basis, where a screen is considered to be all screens done in the same day/batch
#of cells

#convert lists to numpy objects ndarrays to be able to easily perform math operations
variances = np.asarray(variances)
averages = np.asarray(averages)

#variances that have been fitted to the inverse gamma distribution
rvm_variances = ((NUM_REPL - 1) * variances + 2 * invgammaparam_a * (1 / (invgammaparam_a * invgammaparam_b))) / ((NUM_REPL - 1) + 2 * invgammaparam_a)          

#calculate std dev from the variances that have been fitted to the inverse gamma distribution
denominator = np.sqrt(rvm_variances / NUM_REPL)

#calculate t statistic for each compound, using the rvm_variances
t_stats = (averages - 0) / denominator

len_t_stats = t_stats.shape[0]

p_val = []
df = NUM_REPL - 1 + (2 * invgammaparam_a)

#p values for 2 tailed t tests
for i in range(0, len_t_stats):
    if t_stats[i] <= 0:
        prob = scipy.stats.t.cdf(t_stats[i], df)
        prob *= 2
        p_val.append(prob)
    else:
        prob = scipy.stats.t.sf(t_stats[i], df)
        prob *= 2
        p_val.append(prob)

p_val = np.asarray(p_val)
p_val = p_val[:, np.newaxis]

coordinates = []

#list out the plate, row, and col coordinates to be concatenated with the p values
#this helps keep track of where each p value came from in the physical location
#on the plates after the p values are sorted in order to do FDR controlling
#such as benjamini hochberg
#these are indexed from 1 not 0 for ease of interpretation (i.e. plate 1 rather than plate 0)
for i in range(0, unique_plates):
    for j in range(0, PLATE_ROWS):
        for k in range(0, PLATE_COLS):
            coordinates.append(i + INDEX_OFFSET)
            coordinates.append(j + INDEX_OFFSET)
            coordinates.append(k + INDEX_OFFSET)

coordinates = np.asarray(coordinates)
len_coord = len(coordinates)
coordinates = np.reshape(coordinates, (len_coord / 3, 3), order = 'C')

p_val_coord = np.concatenate((p_val, coordinates), axis = 1)
p_val_coord = np.ndarray.tolist(p_val_coord)

p_val_coord.sort()
print(p_val_coord)

print(t_stats)
print(averages)

[[0.0013063588913400147, 1.0, 1.0, 8.0], [0.001654965594823113, 1.0, 11.0, 21.0], [0.001757399932105321, 4.0, 11.0, 20.0], [0.002067995849852363, 1.0, 14.0, 18.0], [0.003564152530083864, 3.0, 15.0, 4.0], [0.0048690171591989845, 3.0, 12.0, 18.0], [0.005383622005568888, 1.0, 11.0, 13.0], [0.005734191542203928, 3.0, 6.0, 16.0], [0.005943691449092312, 3.0, 4.0, 21.0], [0.0072818679412913695, 3.0, 3.0, 18.0], [0.007338475667182011, 6.0, 13.0, 3.0], [0.008413424587153641, 1.0, 2.0, 17.0], [0.010101001478442468, 1.0, 12.0, 5.0], [0.01118397083033249, 1.0, 10.0, 7.0], [0.012051691014653152, 7.0, 9.0, 22.0], [0.012810154724578303, 1.0, 1.0, 3.0], [0.013045900959903527, 6.0, 14.0, 12.0], [0.013712436101821934, 2.0, 15.0, 15.0], [0.014052378192844779, 3.0, 5.0, 3.0], [0.014123660448459422, 4.0, 1.0, 5.0], [0.014296215831779811, 7.0, 3.0, 9.0], [0.016873092768626422, 2.0, 15.0, 10.0], [0.01717880229196462, 4.0, 1.0, 18.0], [0.017228068961157084, 3.0, 16.0, 9.0], [0.017323797854681994, 4.0, 11.0, 1

In [8]:
#select hits based on t tests (under random variance model), on a screen-by-screen
#basis, where a screen is considered to be all screens done in the same day/batch
#of cells

#ignoring multiple hypothesis testing and therefore not controlling for FDR
SIGMA = 0.05
hits_t_test = []
sg_enhancers_t_test = []

len_p_val_coord = len(p_val_coord)

for i in range(0, len_p_val_coord):
    if p_val_coord[i][0] < SIGMA:
        plate_position = (p_val_coord[i][1] - 1) * PLATE_ROWS * PLATE_COLS
        row_position = (p_val_coord[i][2] - 1) * PLATE_COLS
        col_position = p_val_coord[i][3] - 1
        if t_stats[plate_position + row_position + col_position] <= 0:
            hits_t_test.append(p_val_coord[i])
        else:
            sg_enhancers_t_test.append(p_val_coord[i])

print(len(hits_t_test))        
print(hits_t_test)
print(len(sg_enhancers_t_test))
print(sg_enhancers_t_test)

#with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/hits ttest.csv','w') as hits_t_test_outputfile:
    #writer = csv.writer(hits_t_test_outputfile, lineterminator = '\n')
    #writer.writerows(hits_t_test)
        
#with open ('/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/Data - analyses/SG project/screening/granule area to nuclear area analyses of large screens/150610 cvb smnpc spectrum screen rewashed/sg enhancers ttest.csv','w') as sgenh_t_test_outputfile:
    #writer = csv.writer(sgenh_t_test_outputfile, lineterminator = '\n')
    #writer.writerows(sg_enhancers_t_test)


52
[[0.001654965594823113, 1.0, 11.0, 21.0], [0.001757399932105321, 4.0, 11.0, 20.0], [0.002067995849852363, 1.0, 14.0, 18.0], [0.003564152530083864, 3.0, 15.0, 4.0], [0.0048690171591989845, 3.0, 12.0, 18.0], [0.005734191542203928, 3.0, 6.0, 16.0], [0.005943691449092312, 3.0, 4.0, 21.0], [0.0072818679412913695, 3.0, 3.0, 18.0], [0.007338475667182011, 6.0, 13.0, 3.0], [0.008413424587153641, 1.0, 2.0, 17.0], [0.010101001478442468, 1.0, 12.0, 5.0], [0.01118397083033249, 1.0, 10.0, 7.0], [0.012051691014653152, 7.0, 9.0, 22.0], [0.012810154724578303, 1.0, 1.0, 3.0], [0.013045900959903527, 6.0, 14.0, 12.0], [0.014052378192844779, 3.0, 5.0, 3.0], [0.014123660448459422, 4.0, 1.0, 5.0], [0.016873092768626422, 2.0, 15.0, 10.0], [0.01717880229196462, 4.0, 1.0, 18.0], [0.017228068961157084, 3.0, 16.0, 9.0], [0.017323797854681994, 4.0, 11.0, 15.0], [0.017563582390404137, 4.0, 4.0, 22.0], [0.018879293981446995, 3.0, 11.0, 21.0], [0.02261176375151374, 5.0, 5.0, 13.0], [0.023136607101175467, 3.0, 12.0

In [9]:
#benjamini hochberg method to control FDR
FDR = 0.05
k = 1.0
i = 0
m = unique_plates * PLATE_ROWS * PLATE_COLS

hits_fdr = []

print(k / m * FDR)

while p_val_coord[i][0] < (k / m * FDR):
    hit = p_val_coord[i]
    hits_fdr.append(hit)
    i += 1
    k += 1.0
    
print(hits_fdr)

1.86011904762e-05
[]
