In [6]:
#BIOL51000 Fall 2, 2020
#Week 4 Assignment – Array Analysis

In [7]:
#Import necessary components
import sys, os
from PIL import Image
from numpy import array, dstack, transpose, uint8, zeros, log2

In [8]:
#=====================Define Function to copy existing PIL image data into a numpy array===========================

def imageToPixmapRGB(img):
    """Copy the pixel data of an image object into a (height, width, 3) uint8 array.

    img must provide convert('RGB'); the converted image must expose .size as
    (width, height) and .getdata() as a flat sequence of RGB triples.
    """
    rgbImage = img.convert('RGB')  #Force three colour values per pixel
    width, height = rgbImage.size

    #getdata() is flat (one entry per pixel), so fold it back into rows and columns
    flatPixels = array(rgbImage.getdata(), uint8)
    return flatPixels.reshape((height, width, 3))

#====================================End imageToPixmapRGB definition===============================================

In [9]:
#=================================Define Function to Load the Array Image==========================================

def loadArrayImage(fileName, sampleName, nRows, nCols=None):
  """Read an array image and sum its RGB signal over an nRows x nCols grid.

  Parameters:
    fileName   -- path of the image file, passed to Image.open
    sampleName -- name given to the returned Microarray
    nRows      -- number of grid rows to divide the image height into
    nCols      -- number of grid columns; defaults to nRows when omitted (or 0)

  Returns a Microarray whose data has shape (3, nRows, nCols): one layer per
  RGB colour, each element holding the summed pixel values of one grid cell.
  """

  #If number of columns is not specified, set nCols equal to nRows
  if not nCols:
    nCols = nRows

  #Create a numeric matrix to contain signal info, initially filled with zeros
  dataMatrix = zeros((3, nRows, nCols), float)

  #Open the image in a context manager so the file handle is released as soon
  #as the pixels have been copied into the numpy array
  with Image.open(fileName) as img:
    pixmap = imageToPixmapRGB(img)

  height, width = pixmap.shape[:2]

  #Floating point grid pitch, used to place the start of each cell
  dx = width/float(nCols)
  dy = height/float(nRows)

  #Integer cell size, rounded up so that every pixel falls in at least one cell
  xSize = 1 + (width-1)//nCols
  ySize = 1 + (height-1)//nRows

  #Loop through each grid row and work out its pixel range
  for row in range(nRows):
    yStart = int(row*dy)
    yEnd   = yStart + ySize

    #Loop through each grid column and work out its pixel range
    for col in range(nCols):
      xStart = int(col*dx)
      xEnd   = xStart + xSize

      #Total signal per colour for this grid element (slices past the image edge are truncated)
      elementData = pixmap[yStart:yEnd,xStart:xEnd]
      dataMatrix[:,row, col] = elementData.sum(axis=(0,1))

  #Return Microarray object
  return Microarray(sampleName, dataMatrix)

#========================================End loadArrayImage Definition============================================

In [10]:
#========================================Define Microarray Class==================================================

class Microarray(object):
  """Container for one or more layers (channels) of microarray signal values.

  Attributes:
    name      -- label supplied at construction
    data      -- float array of shape (nChannels, nRows, nCols); the clip and
                 normalise methods modify it
    origData  -- copy of the data as it was when the object was constructed
    nChannels, nRows, nCols -- sizes of the three axes of data
    rowData, colData        -- row and column labels
  """

  #Define constructor function
  def __init__(self, name, data, rowData=None, colData=None):
    """Store a float copy of data, which must have 2 axes (rows, cols) or
    3 axes (channels, rows, cols). Raises Exception for any other rank.

    rowData / colData are optional label sequences; when omitted (or empty)
    they default to range(nRows) / range(nCols).
    """

    #Store name for microarray data and make a float copy of the input data.
    #Floats are required because the normalisation methods assign scaled and
    #logarithmic values back into this array; an integer array would truncate them.
    self.name = name
    data = array(data, float)

    #Extract size of axes in the array data
    shape = data.shape

    if len(shape) == 3:  #For 3 axes
      self.nChannels, self.nRows, self.nCols = shape

    elif len(shape) == 2:  #For 2 axes
      self.nRows, self.nCols = shape
      self.nChannels = 1
      data = array([data])

    else:  #otherwise raise exception
      raise Exception('Array data must have either 2 or 3 axes.')

    #Ensure three axes, tying it to the object
    self.data = data

    #Create a copy of the original data which will be left in its original form
    self.origData = array(data)

    #Associate row and column labels with the object. Test with "is None"/len
    #rather than truthiness so that numpy arrays of labels are accepted too.
    if rowData is None or len(rowData) == 0:
      rowData = range(self.nRows)
    if colData is None or len(colData) == 0:
      colData = range(self.nCols)
    self.rowData = rowData
    self.colData = colData

  #Define a function to make an image
  def makeImage(self, squareSize=20, channels=None):
    """Return a PIL RGB image with one squareSize x squareSize block per array element.

    channels lists which data layers feed the red, green and blue components;
    a None entry gives a blank component. By default a single-layer array is
    drawn in greyscale and otherwise the first three layers are used.
    """

    #Find data extremes and calculate range
    minVal = self.data.min()
    dataRange = self.data.max() - minVal

    #Rescale a copy of the data so the lowest value maps to 0 and the highest to 255.
    #Completely flat data has no range to stretch, so it is drawn as all zero.
    if dataRange > 0:
      adjData = (self.data - minVal) * 255 / dataRange
    else:
      adjData = zeros(self.data.shape, float)
    adjData = array(adjData, uint8)

    #If channels not passed in, set defaults
    if not channels:
      if self.nChannels == 1:
        channels = (0,0,0) # Greyscale
      else:
        channels = list(range(self.nChannels))[:3]

    #Allow for blank color channels
    pixmap = []
    for i in channels:
      if i is None:
        pixmap.append(zeros((self.nRows, self.nCols), uint8))
      else:
        pixmap.append(adjData[i])

    #Allow for channels to be shorter than 3
    while len(pixmap) < 3:
      pixmap.append(zeros((self.nRows, self.nCols), uint8))

    #Stack the three color layers and use PIL module to make image
    pixmap = dstack(pixmap)
    img = Image.fromarray(pixmap, 'RGB')

    #Resize image. Nearest-neighbour resampling is requested explicitly so that
    #each array element stays a sharp, uniformly coloured square instead of
    #being blurred by an interpolating filter.
    width = self.nCols * squareSize
    height = self.nRows * squareSize
    img = img.resize((width, height), resample=Image.NEAREST, box=None)

    #Return image
    return img

  #Define a function to clip lowest base value of the data so it doesn't drop below specified threshold
  def clipBaseline(self, threshold=None, channels=None, defaultProp=0.2):
    """Raise values below a baseline to that baseline, then shift the baseline
    to zero and rescale so that the original maximum is restored.

    threshold   -- baseline value; when None it is defaultProp times the maximum
    channels    -- indices of the layers to process (default: all layers)
    defaultProp -- proportion of the maximum used when threshold is None

    Only the selected layers of self.data are changed.
    """

    #Specify channels to state which layers of the array are to be considered
    if not channels:
      channels = range(self.nChannels)

    #A list index selects (a copy of) exactly the requested layers
    channels = list(channels)
    selected = self.data[channels]

    #Find maximum value and if threshold value is not specified calculate with default proportion
    maxVal = selected.max()
    if threshold is None:
      limit = maxVal * defaultProp
    else:
      limit = threshold

    #Set every element below the limit to the limit, then move the baseline to zero
    selected[selected < limit] = limit
    selected = selected - limit

    #Scale to restore the original maximum; skipped when nothing lies above the
    #baseline, where the scale factor would be a division by zero
    span = maxVal - limit
    if span > 0:
      selected = selected * (maxVal / span)

    #Write the processed layers back to the positions they were taken from
    self.data[channels] = selected

  #Define a function to scale the values to some upper limit
  def normaliseMax(self, scale=1.0, perChannel=True):
    """Scale the data so that its maximum becomes scale, either separately for
    each layer (perChannel=True) or using the maximum over all layers.
    Data whose maximum is zero is left unchanged.
    """

    #Consider each data layer separately
    if perChannel:
      for i in range(self.nChannels):
        peak = self.data[i].max()
        if peak != 0:
          self.data[i] = self.data[i] * scale / peak

    #Use maximum value from all the data
    else:
      peak = self.data.max()
      if peak != 0:
        self.data = self.data * scale / peak

  #Define a function to convert the data values into a logarithmic scale
  def normaliseLogMean(self):
    """Replace each layer with log2(1 + value / layer mean)."""

    #Clip baseline to remove any negative values
    self.clipBaseline(threshold=0.0)

    #Add 1.0 so that logarithm is not taken of any zero values
    for i in range(self.nChannels):
      mean = self.data[i].mean()
      if mean != 0:
        self.data[i] = log2( 1.0 + self.data[i] / mean )
      #A zero mean after clipping means the layer is all zero; log2(1 + 0) is
      #zero, so the layer is already correct and the division is avoided


  #Define a function to check data size
  def checkDataSize(self, channelData):
    """Return channelData as an array, raising Exception unless its shape is (nRows, nCols)."""

    #Trigger exception if input for a layer is not the same size as the existing array data
    channelData = array(channelData)
    if channelData.shape != (self.nRows, self.nCols):
      msg = 'Attempt use data of wrong size'
      raise Exception(msg)

    return channelData

  #Define function to set channel to replace all the data for an array layer specified at a given existing index
  def setChannel(self, channelData, index=0):
    """Overwrite the layer at index with channelData (shape checked first)."""

    channelData = self.checkDataSize(channelData)
    self.data[index] = channelData

  #Define function to add an extra layer after the existing ones
  def addChannel(self, channel_data):
    """Append channel_data (shape checked first) as a new last layer and
    increase nChannels by one. origData is not changed.
    """

    #Local import, as done for operator in combineChannels: concatenate is not
    #among the numpy names imported at the top of the notebook
    from numpy import concatenate

    channel_data = self.checkDataSize(channel_data)

    #Give the 2D layer a leading axis of length one so it can join the 3D stack
    self.data = concatenate([self.data, channel_data[None, :, :]], axis=0)
    self.nChannels += 1

  #Define a function to combine two channels specified by indices
  def combineChannels(self, indexA, indexB, combFunc=None, replace=None):
    """Combine layers indexA and indexB with combFunc (default: addition).

    The result is stored in the layer at index replace, or appended as a new
    layer when replace is None.
    """

    #If no function has been passed specifying how layers should be combined
    if not combFunc:
      import operator
      combFunc = operator.add

    channelData = combFunc(self.data[indexA], self.data[indexB])

    #State which layer the new, combined data should be placed in
    if replace is None:
      self.addChannel(channelData)
    else:
      self.setChannel(channelData, replace)

  #Define hierarchical row cluster function
  def __hierarchicalRowCluster(self, dataMatrix):
    """Return row indices of dataMatrix (layers, rows, cols) in the order of the
    leaves of a neighbour-joining tree built from summed squared row differences.
    """

    #Import necessary code from files
    #NOTE(review): SeqVariation is an external module that must be importable
    #from the notebook's working directory - confirm it is Python 3 compatible
    from SeqVariation import neighbourJoinTree

    #Create a distanceMatrix, initially full of zeros
    n = len(dataMatrix[0])
    distanceMatrix = zeros((n, n), float)

    #Loop through each layer and row in dataMatrix and subtract row from whole array yielding differences to row
    for channelData in dataMatrix:
      for i, row in enumerate(channelData):
        diffs = channelData - row
        sqDiffs = diffs * diffs  #Square the difference
        sqDists = sqDiffs.sum(axis=1)  #Add the squared differences (sum of squares)
        distanceMatrix[i,:] += sqDists  #Accumulate over layers in distanceMatrix

    #Create a hierarchical tree from the distanceMatrix
    tree, joinOrder = neighbourJoinTree(distanceMatrix.tolist())

    #Initialize rowOrder list as a copy of the tree
    #(presumably a nested sequence whose leaves are integer row indices - verify against SeqVariation)
    rowOrder = list(tree)

    #Record an index i to represent the position in the list which is being processed
    i  = 0
    while i < len(rowOrder):

      #While the item at position i is not an integer, splice the contents of that sub-list into the main list
      while not isinstance(rowOrder[i], int):
        rowOrder[i:i+1] = rowOrder[i]

      i += 1

    return rowOrder

  #Define hierarchical cluster function
  def hierarchicalCluster(self):
    """Return a new Microarray, named '<name>-Sorted', whose rows and columns
    (and their labels) are reordered by hierarchical clustering. This object
    is not modified.
    """

    #Cluster the rows in self.data
    rows = self.__hierarchicalRowCluster(self.data)

    #Swap the row and column axes so that the columns can be clustered as rows
    swapped = transpose(self.data, axes=(0,2,1))

    #Cluster the columns
    cols = self.__hierarchicalRowCluster(swapped)

    #Use reordered rows and cols as indices to shuffle the data, according to the hierarchical clustering
    data = self.data[:,rows] # Rearrange
    data = data[:,:,cols]

    data = array(data.tolist()) # to fix PIL.Image bug

    name = self.name + '-Sorted'
    rowData = [self.rowData[i] for i in rows]
    colData = [self.colData[j] for j in cols]

    #Create a new Microarray object with different order for its rows and columns
    sortedArray = Microarray(name, data, rowData, colData)

    #Return sorted array
    return sortedArray

#Define log2Ratio function
def log2Ratio(data1, data2):
    """Return the element-wise log base 2 of data1/data2.

    Both inputs are offset by 1e-3 first so that a zero in either one does
    not lead to a division by zero or a logarithm of zero.
    """
    numerator = array(data1) + 1e-3
    denominator = array(data2) + 1e-3
    ratio = numerator/denominator
    return log2(ratio)
#=========================End Microarray Class and log2Ratio Function Definitions===========================


In [11]:
#=========================Load the array image file into a Microarray object=================================
#Prompt for the image location, e.g. examples/RedGreenArray.png
imgFile = input("Enter file path and name: ")  #Obtain image file path from user
#Divide the image into a grid of 18 rows by 17 columns and total the RGB signal in each grid element
rgArray = loadArrayImage(imgFile, 'TwoChannel', 18, 17)  #Load data from image into array
#============================================================================================================

Enter file path and name: examples/RedGreenArray.png


In [13]:
#=================Compare two signal-intensity arrays by taking one away from the other======================
#NOTE: this cell overwrites channels 0 and 1 of rgArray, so running it a second time gives a different result
#Calculate the array of differences (channel 0 minus channel 1, i.e. red minus green)
diff = rgArray.data[0]-rgArray.data[1]
#Store the differences in the first two color channels
rgArray.setChannel(diff, 0)
rgArray.setChannel(-diff, 1)  #Flip the sign for the green channel
rgArray.clipBaseline(threshold=0.0, channels=(0,1))  #Clip the values at 0.0 to remove negative values
rgArray.makeImage(20).save('RGArray.png') #Save a copy of the image (20 pixels per array element)
img = Image.open('RGArray.png')  #Reload the saved file
img.show() #Display resulting image
#=============================================================================================================

In [14]:
#=Combine two-channel red and green array data with the log(base 2) of the ratio of the red and green channels=
#Store log2(channel 0 / channel 1) in channel 2, replacing what that channel held
rgArray.combineChannels(0, 1, combFunc=log2Ratio, replace=2)
#Scale each channel separately so that its maximum is 1.0
rgArray.normaliseMax(perChannel=True)
#Use channel 2 for all three colour components, giving a greyscale picture of the ratio
rgArray.makeImage(20, channels=(2,2,2)).show()
#==============================================================================================================

In [15]:
#=======Reorder the rows and columns of the array so that similar rows and similar columns are adjacent=======
#=====(similarity is the sum of squared differences between rows; the order comes from a hierarchical tree)===
#hierarchicalCluster returns a new Microarray; rgArray itself is left unchanged
sortedArray = rgArray.hierarchicalCluster()
sortedArray.makeImage(20).show()

#Print the original row labels and the row labels after sorting
print(rgArray.rowData)
print(sortedArray.rowData)
#===============================================================================================================
#TODO: this cell needs the external SeqVariation module (which provides neighbourJoinTree) to be importable

SyntaxError: Missing parentheses in call to 'print'. Did you mean print('Executable file %s not found' % formatdbExe)? (Alignments.py, line 228)

In [14]:
#======================Normalize data values for fluorescence intensity data using a log scale==================
#NOTE: normaliseLogMean modifies rgArray.data in place, so re-running this cell changes the result again
#Clip values below zero, then replace each channel with log2(1 + value / channel mean)
rgArray.normaliseLogMean()
rgArray.makeImage(25).save('normaliseLogMean.png')  #Save the image (25 pixels per array element)
fluor = Image.open('normaliseLogMean.png')  #Reload the saved file
fluor.show()  #Display resulting image
#================================================================================================================