In [1]:
import numpy as np
import csv
import matplotlib.pyplot as plt
import scipy
import scipy.stats
import math

In [4]:
# Perform missing-data imputation on a screen-by-screen basis, where a screen
# is considered to be all screens done on the same day/batch of cells.
#
# Method: KNN (k nearest neighbors) missing-data imputation. A Euclidean-style
# distance selects the k nearest neighbors and a weighted average of their
# values estimates the missing value. Missing values are imputed starting with
# the first column and going down each row in that column, then repeating with
# the next column to the right in the data table.
#
# Rationale for k as originally written: in a given screen there are not likely
# to be more than 10 or so hits, so only about 10 neighboring hits would be
# informative for imputing a putative hit that is missing a value; for putative
# non-hits there are many neighbors with similar values, so 10 is still a good
# number.
# NOTE(review): that rationale argues for k = 10, but the value in use is 4.
# The value is left untouched here; confirm which one is intended.
K_NEAREST_NEIGHBORS = 4

# Location of the comma-separated screen data; blank cells are read as NaN.
DATA_PATH = '/Users/markfang/Dropbox/UCSD Grad work/RNA-Yeo Lab/ipython test data/missing values 2.csv'

# Plain 'r' is used instead of the legacy 'rU': the 'U' flag was removed in
# Python 3.11 (open() raises ValueError) and text mode already translates
# newlines universally. The `with` block closes the handle once parsing is done.
with open(DATA_PATH, 'r') as data_file:
    data_table = np.genfromtxt(data_file, delimiter=',')

# Show the table as loaded, missing values included, for comparison with the
# imputed table.
print(data_table)

numrows = data_table.shape[0]
numcols = data_table.shape[1]

def _impute_cell(table, row, col, k):
    """Estimate the missing value at ``table[row, col]`` from its k nearest rows.

    Parameters
    ----------
    table : 2-D float ndarray in which NaN marks a missing value.
    row, col : position of the missing value to estimate.
    k : maximum number of neighbors to average over (must be >= 1).

    Returns
    -------
    float
        Weighted average of the neighbors' values in column ``col``, weighted
        by 1 / distance. NaN if no row is usable as a neighbor.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))

    # A row can only help if it has an observed value in the column being
    # imputed; the row holding the missing value is never its own neighbor.
    has_estimate = ~np.isnan(table[:, col])
    has_estimate[row] = False

    # Distances ignore the column that contains the missing value.
    other_cols = np.delete(table, col, axis=1)
    target = other_cols[row]
    neighbors = other_cols[has_estimate]
    estimates = table[has_estimate, col]

    # Only columns observed in BOTH the target row and the neighbor contribute.
    shared = ~np.isnan(neighbors) & ~np.isnan(target)
    # NOTE(review): this is the sum of squared differences, with no square root
    # and no normalization for the number of shared columns. It is kept as-is
    # so existing results do not change; confirm against the cited method
    # (Troyanskaya et al., 2001).
    dist = np.where(shared, (neighbors - target) ** 2, 0.0).sum(axis=1)

    # Rows sharing no observed column have no defined distance. They are
    # dropped rather than given a NaN distance, so they can never be selected
    # when fewer than k comparable rows exist.
    comparable = shared.any(axis=1)
    dist = dist[comparable]
    estimates = estimates[comparable]
    if dist.size == 0:
        return np.nan

    # A stable sort breaks ties by row order, matching an "ordinal" ranking.
    # Re-sorting the chosen indices keeps the summation in row order.
    nearest = np.sort(np.argsort(dist, kind='stable')[:k])
    dist = dist[nearest]
    estimates = estimates[nearest]

    # An identical neighbor would get infinite weight (inf / inf -> NaN); the
    # limit of the weighted average is the mean of the exact matches.
    exact = dist == 0
    if exact.any():
        return float(estimates[exact].mean())

    weights = 1.0 / dist
    return float((weights * estimates).sum() / weights.sum())


def knn_impute(table, k):
    """Fill every NaN in ``table`` in place using KNN imputation.

    Cells are visited column by column, top to bottom. Each imputed value is
    written back immediately, so later imputations see earlier ones both as
    neighbor coordinates and as candidate estimates. A cell with no usable
    neighbor is left as NaN.

    Parameters
    ----------
    table : 2-D float ndarray in which NaN marks a missing value; modified in place.
    k : maximum number of nearest neighbors used per missing value (>= 1).

    Returns
    -------
    ndarray
        The same ``table`` object, for convenience.
    """
    n_rows, n_cols = table.shape
    for col in range(n_cols):
        for row in range(n_rows):
            # Missing values are the NaN entries of the ndarray.
            if np.isnan(table[row, col]):
                table[row, col] = _impute_cell(table, row, col, k)
    return table


knn_impute(data_table, K_NEAREST_NEIGHBORS)

print(data_table)

[[  1.   2.   2.]
 [  2.  nan   4.]
 [  3.   6.   6.]
 [ 10.  11.  11.]
 [ 13.  nan  nan]
 [ -5.  -9.  -9.]
 [ -6.  -7.  -7.]
 [ nan  -8.  -8.]]
[[  1.           2.           2.        ]
 [  2.           4.00600462   4.        ]
 [  3.           6.           6.        ]
 [ 10.          11.          11.        ]
 [ 13.           9.75072189   9.8385891 ]
 [ -5.          -9.          -9.        ]
 [ -6.          -7.          -7.        ]
 [ -5.41641504  -8.          -8.        ]]
