## REDSEA python version 0.0.1

Translated from Yunhao Bai's MATLAB code by Bokai Zhu.

Some minor differences from Yunhao's MATLAB version (subject to change in future versions):

1. Does not filter cells by positive nuclear identity, because that part of the code lives in "mibisegmentByDeepProbWithPerm3.m". It can easily be added by the user.

2. Does not produce the sanity plot, since that belongs outside of the compensation function. It may optionally be added later.

3. Does not produce FCS files at the end. Instead, the four outputs that would have been FCS files are produced as matrices (pandas DataFrames), which are easier to use later.

In [1]:
# necessary packages
import PIL
from PIL import Image, ImageSequence, ImageOps
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage 
import skimage.measure
import skimage.morphology
import glob
from scipy.io import loadmat
import time

In [2]:
# helper function 1

def ismember(a, b):
    """Return, for every item of *a*, the index of its first occurrence in *b*.

    Items of *a* that do not occur in *b* are reported as ``None``.
    The result is a list with one entry per item of *a*.
    """
    first_position = {}
    for position, value in enumerate(b):
        # setdefault keeps the earliest index when a value repeats in b
        first_position.setdefault(value, position)
    return [first_position.get(query) for query in a]

# helper function 2

def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """Print a one-line text progress bar to stdout.

    Parameters:
        iteration: current step number.
        total: total number of steps; when it is 0 there is nothing to do
            and the bar is drawn as fully complete.
        prefix: text printed before the bar.
        suffix: text printed after the percentage.
        decimals: number of decimals shown in the percentage.
        length: width of the bar in characters.
        fill: character used for the completed part of the bar.
        printEnd: line terminator passed to ``print`` (carriage return by
            default so that successive calls overwrite the same line).
    """
    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # total == 0 would otherwise raise ZeroDivisionError
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total:
        print()

In [3]:
# Input locations and the channel table

# Root folder of the sample data; every other path is derived from it.
mainPath = '/home/bkzhu/paper/compensation/redseaPY_files/sampleData_MIBI'
# csv file listing the channels (required)
massDS_path = f'{mainPath}/sampleData.csv'
# single folder holding one tiff per channel (required)
pathTiff = f'{mainPath}/originalTiff/Point1'
# segmentation result stored as a .mat file
pathMat = f'{mainPath}/originalTiff/Point1/segmentationParams.mat'
#pathResults = mainPath + '/FCS_output' # output location, not needed

# Load the channel table and keep its 'Label' column as the channel list.
massDS = pd.read_csv(massDS_path)
clusterChannels = massDS['Label']
# Positions of the cluster channels inside the 'Label' column.
clusterChannelsInds = np.flatnonzero(np.isin(clusterChannels, massDS['Label']))

print(massDS.head())

        Label
0       dsDNA
1  Histone H3
2         CD4
3        CD56
4  CD21 (CR2)


In [4]:
# Parameters for the boundary compensation.

# Compensation mode flag; 2 means boundary compensation.
# NOTE(review): this value is not read by any of the cells shown in this notebook.
boundaryMod = 2
# 1 means subtract + reinforce. The value is used as the weight of the identity
# matrix when the cell-cell normalisation matrix (cellPairNorm) is built.
REDSEAChecker = 1
# Shape of the neighbourhood used to detect boundary pixels:
# 1 == square, 2 == diamond (any other value is not recognised).
elementShape = 2
# Extension of the neighbourhood around a pixel: the square has side
# 2*elementSize+1, the diamond has radius elementSize.
elementSize = 2

In [5]:
# Select which channels receive the boundary compensation.

normChannels = ['CD4','CD56','CD21 (CR2)','CD163','CD68','CD3','CD20','CD8a']
#### should be inside the function
# Position of every selected channel within the 'Label' column (None when absent).
normChannelsInds = ismember(normChannels,massDS['Label'])

# A None index is interpreted by NumPy as np.newaxis, so assigning through it
# would flag *every* channel for compensation. Fail loudly on unknown names.
missingChannels = [name for name, ind in zip(normChannels, normChannelsInds) if ind is None]
if missingChannels:
    raise ValueError(f"normChannels not found in massDS['Label']: {missingChannels}")

# Column vector with one row per channel: 1 = compensate, 0 = leave untouched.
channelNormIdentity = np.zeros((len(massDS['Label']),1))
for ind in normChannelsInds:
    channelNormIdentity[ind] = 1

In [6]:
#### Read one tiff per channel and stack them into the 'countsNoNoise' matrix

files=glob.glob(pathTiff+'/*.tif*') # all the tiff/tif files under the folder
if not files:
    # without this check the extension detection below fails with a bare IndexError
    raise FileNotFoundError(f"No .tif/.tiff files found under {pathTiff}")

# Detect the extension from the END of the file name; a plain substring test
# would also match "tiff" appearing in a folder name of the path.
end = ".tiff" if files[0].endswith(".tiff") else ".tif"

# One 2-D image per channel, in the order of clusterChannels.
array_list=[plt.imread(pathTiff+'/'+str(channel)+end) for channel in clusterChannels]
countsNoNoise=np.stack(array_list,axis=2) # count matrix: (row, col, channel)
##### for this folder the matrix is read

In [7]:
# Load the segmentation mask and derive the basic counts from it

#### direct translation of redsea matlab v1.0
segMat = loadmat(pathMat)
newLmod = segMat['newLmod'] # labelled segmentation mask stored in the .mat file
stats = skimage.measure.regionprops(newLmod) # region properties of the labelled regions
labelNum = newLmod.max() # highest label value, used as the number of labels
##### everything that depends on the .mat file is done
channelNum = len(clusterChannels) # number of channels

In [8]:
### Zero-filled containers, one row per label

# summed counts per cell and channel
data = np.zeros(shape=(labelNum, channelNum))
# same layout as `data`, will hold the counts divided by the cell area
dataScaleSize = np.zeros_like(data)
# cell areas as a column vector
cellSizes = np.zeros(shape=(labelNum, 1))

In [9]:
# Extract the counts of every whole-cell region, one row per cell label.

# regionprops only returns the labels that are actually present in the mask,
# so the row is derived from each region's own label instead of its position
# in `stats`. Rows of labels that are absent from the mask stay zero.
for region in stats:
    i = region.label - 1 # labels are 1-based, rows are 0-based
    coords = region.coords # (row, col) of every pixel of this cell
    label_counts = countsNoNoise[coords[:, 0], coords[:, 1], :] # all channel counts for this cell
    channel_totals = np.sum(label_counts, axis=0) # sum the counts for this cell
    data[i,0:channelNum] = channel_totals
    dataScaleSize[i,0:channelNum] = channel_totals / region.area # scaled by size
    cellSizes[i] = region.area # cell size

In [10]:
# Boundary compensation, step 1: the cell-cell contact matrix.
# Corresponds to part of the MATLAB function
# MIBIboundary_compensation_boundarySA(newLmod,data,countsNoNoise,channelNormIdentity,elementShape,elementSize,REDSEAChecker)
#
# Inputs used by the whole compensation:
#   newLmod          segmentation mask matrix
#   countsNoNoise    counts matrix (row, col, channel)
#   normIdentity     only needed for plots, left out for now
#   elementShape, elementSize
#   REDSEAChecker    subtract / reinforce selection

rowNum, colNum = newLmod.shape
cellNum = labelNum
# cellPairMap[a, b] counts the background pixels whose 3x3 window touches both cells
cellPairMap = np.zeros((cellNum, cellNum))

# Pad the mask with one background pixel on every side so that a 3x3 window
# exists for the pixels on the image border as well.
newLmod_border = np.pad(newLmod, pad_width=1, mode='constant', constant_values=0)

# Only background (0) pixels contribute, so visit just those.
for zeroRow, zeroCol in np.argwhere(newLmod == 0):
    # the padding shifts coordinates by +1, so this window is centred on the pixel
    window = newLmod_border[zeroRow:zeroRow + 3, zeroCol:zeroCol + 3]
    # sorted unique values minus one give 0-based cell indices; the first entry
    # is the background itself and is dropped
    touchingCells = (np.unique(window) - 1)[1:]
    if not 2 <= len(touchingCells) <= 4:
        continue # windows touching fewer than 2 or more than 4 cells are not counted
    # count the pixel once for every unordered pair of touching cells
    for first in range(len(touchingCells) - 1):
        for second in range(first + 1, len(touchingCells)):
            cellPairMap[touchingCells[first], touchingCells[second]] += 1

In [11]:
# Turn the cell-cell contact counts into the compensation weight matrix.

# The pairs were only counted in one direction; make the map symmetric.
cellPairMap = cellPairMap + np.transpose(cellPairMap)

###############
cellBoundaryTotal = np.sum(cellPairMap,axis=0) # total shared boundary of every cell
############### this step might cause error in ark version, double check with YH

# Divide to get the fraction of boundary shared with every neighbour.
# np.tile repeats the totals along the rows, so column c is divided by total[c].
cellBoundaryTotalMatrix = np.tile(cellBoundaryTotal,(cellNum,1))
#cellBoundaryTotalMatrix = repmat(cellBoundaryTotal',[1 cellNum]);
# A cell that shares no boundary has a total of 0. A plain division would give
# 0/0 = NaN for its whole column, and the later matrix product would spread the
# NaN to every cell. Such cells get fraction 0 instead (nothing to subtract).
cellPairFraction = np.divide(
    cellPairMap,
    cellBoundaryTotalMatrix,
    out=np.zeros_like(cellPairMap),
    where=cellBoundaryTotalMatrix > 0,
)
cellPairNorm = REDSEAChecker*np.identity(cellNum) - cellPairFraction
# The division above normalises by column; transposing makes row r hold the
# fractions normalised by cell r's own total.
cellPairNorm = np.transpose(cellPairNorm)
# Container for the signal of the pixels along the boundary of every cell.
MIBIdataNearEdge1 = np.zeros((cellNum,channelNum))

In [12]:
# Select the boundary pixels of every cell and sum their counts.

##### Progress-bar bookkeeping
items = list(range(cellNum))
l = len(items)
printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50) # progress bar
#####

###### Pre-calculate the neighbourhood shape once
if elementShape==1: # square with side 2*elementSize+1
    shapeMask = skimage.morphology.square(2*elementSize+1)
elif elementShape==2: # diamond with radius elementSize
    shapeMask = skimage.morphology.diamond(elementSize)
else:
    # Continuing with an unknown shape would silently produce an all-zero
    # boundary signal, so stop here instead of only printing a message.
    raise ValueError("Error elementShape Value not recognized.")
shapeRows, shapeCols = np.where(shapeMask==1)
# Both shapes span 2*elementSize+1 pixels per side, so subtracting elementSize
# turns the mask positions into offsets relative to the centre pixel.
rowOffsets = shapeRows - elementSize
colOffsets = shapeCols - elementSize
############

for i in range(cellNum):
    label = i+1 # labels are 1-based, rows are 0-based
    tempRow, tempCol = np.where(newLmod==label)
    # Only pixels far enough from the image border are tested, so the
    # neighbourhood never reaches outside the mask.
    # NOTE(review): the upper bounds are stricter than needed to stay in bounds
    # (row + elementSize <= rowNum - 1 would suffice); kept unchanged so the
    # results stay the same.
    inBounds = (
        (elementSize-1 < tempRow) & (tempRow < rowNum-elementSize-2)
        & (elementSize-1 < tempCol) & (tempCol < colNum-elementSize-2)
    )
    pixelRows = tempRow[inBounds]
    pixelCols = tempCol[inBounds]
    # Labels found in the neighbourhood of every tested pixel: (pixels, shape positions)
    neighbourLabels = newLmod[pixelRows[:, None] + rowOffsets, pixelCols[:, None] + colOffsets]
    # A pixel is near the edge when its neighbourhood contains background (0).
    nearEdge = (neighbourLabels == 0).any(axis=1)
    MIBIdataNearEdge1[i,:] += countsNoNoise[pixelRows[nearEdge], pixelCols[nearEdge], :].sum(axis=0, dtype=np.float64)

    # Update Progress Bar
    printProgressBar(i + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
        

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [13]:
## Final formatting of the compensated signal

# Boundary signal weighted by the cell-cell matrix, i.e. each cell's boundary
# signal with the neighbours' boundary signal subtracted.
boundarySignal = np.dot(MIBIdataNearEdge1.T, cellPairNorm).T
# Reinforce onto the whole-cell signal (original signal).
MIBIdataNorm2 = boundarySignal + data
# Negative values are set to zero.
MIBIdataNorm2 = np.where(MIBIdataNorm2 < 0, 0, MIBIdataNorm2)

# Flipped channel flag: 1 for the channels that are NOT compensated.
rev_channelNormIdentity = 1 - channelNormIdentity
# Composite: compensated values for the flagged channels, original values for
# the others. The (1, channel) row vectors broadcast over all cells.
# MIBIdataNorm2 is the matrix to return.
MIBIdataNorm2 = data * rev_channelNormIdentity.T + MIBIdataNorm2 * channelNormIdentity.T
# Scale by cell size.
dataCompenScaleSize = MIBIdataNorm2 / cellSizes

############ THE POSITIVE NUCLEAR IDENTITY FILTER IS SKIPPED HERE
############ it should be added by the user if wanted

# Sum of the size-scaled signal over all cluster channels, per cell.
sumDataScaleSizeInClusterChannels = dataScaleSize[:, clusterChannelsInds].sum(axis=1)
# Identity 1 = keep the cell, 2 = cell without information in the cluster channels.
labelIdentityNew2 = np.where(sumDataScaleSizeInClusterChannels < 0.1, 2.0, 1.0)

# The four matrices to return, restricted to the kept cells.
keptCells = labelIdentityNew2 == 1
dataCells = data[keptCells, :]
dataScaleSizeCells = dataScaleSize[keptCells, :]
dataCompenCells = MIBIdataNorm2[keptCells, :]
dataCompenScaleSizeCells = dataCompenScaleSize[keptCells, :]


In [14]:
# Assemble the four output tables (one per data flavour)

# 1-based labels of the kept cells (array positions are 0-based, hence the +1).
labelVec = [np.flatnonzero(labelIdentityNew2==1) + 1]

# Sizes of the kept cells, flattened from a column vector into a plain list.
cellSizesVec = cellSizes[labelIdentityNew2==1]
cellSizesVec_flat = list(cellSizesVec.ravel())


def _channelFrame(values):
    """Wrap a (cell, channel) matrix in a DataFrame with the channel names as columns."""
    frame = pd.DataFrame(values)
    frame.columns = clusterChannels
    return frame


# Leading columns shared by all four tables: cell label and cell size.
dataL = pd.DataFrame({'cell_label':labelVec[0].tolist(), 'cell_size':cellSizesVec_flat})

## first: original counts
dataCells_df = _channelFrame(dataCells)
dataL_full = pd.concat([dataL, dataCells_df], axis=1)
### second: original counts scaled by cell size
dataScaleSizeL_df = _channelFrame(dataScaleSizeCells)
dataScaleSizeL_full = pd.concat([dataL, dataScaleSizeL_df], axis=1)
### third: compensated counts
dataCompenL_df = _channelFrame(dataCompenCells)
dataCompenL_full = pd.concat([dataL, dataCompenL_df], axis=1)
### fourth: compensated counts scaled by cell size
dataCompenScaleSizeL_df = _channelFrame(dataCompenScaleSizeCells)
dataCompenScaleSizeL_full = pd.concat([dataL, dataCompenScaleSizeL_df], axis=1)

In [15]:
dataL_full.head() # preview: original counts extracted from the tiff files

Unnamed: 0,cell_label,cell_size,dsDNA,Histone H3,CD4,CD56,CD21 (CR2),Pax5,CD163,CD68,CD3,CD20,CD8a
0,1,22.0,0.0,21.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0,0.0
1,3,58.0,142.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
3,6,74.0,178.0,1872.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,40.0,172.0,1391.0,10.0,0.0,0.0,0.0,0.0,0.0,28.0,0.0,4.0


In [16]:
dataScaleSizeL_full.head() # preview: original counts extracted from the tiff files, scaled by cell size

Unnamed: 0,cell_label,cell_size,dsDNA,Histone H3,CD4,CD56,CD21 (CR2),Pax5,CD163,CD68,CD3,CD20,CD8a
0,1,22.0,0.0,0.954545,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,1.272727,0.0
1,3,58.0,2.448276,1.637931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0
3,6,74.0,2.405405,25.297297,0.040541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,40.0,4.3,34.775,0.25,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.1


In [17]:
dataCompenL_full.head() # preview: REDSEA-compensated counts

Unnamed: 0,cell_label,cell_size,dsDNA,Histone H3,CD4,CD56,CD21 (CR2),Pax5,CD163,CD68,CD3,CD20,CD8a
0,1,22.0,0.0,21.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,29.568627,0.0
1,3,58.0,142.0,95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,74.0,178.0,1872.0,0.723153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,40.0,172.0,1391.0,13.409155,0.0,0.0,0.0,0.0,0.0,35.869332,0.0,3.454849


In [18]:
dataCompenScaleSizeL_full.head() # preview: REDSEA-compensated counts, scaled by cell size

Unnamed: 0,cell_label,cell_size,dsDNA,Histone H3,CD4,CD56,CD21 (CR2),Pax5,CD163,CD68,CD3,CD20,CD8a
0,1,22.0,0.0,0.954545,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,1.344029,0.0
1,3,58.0,2.448276,1.637931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,74.0,2.405405,25.297297,0.009772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,40.0,4.3,34.775,0.335229,0.0,0.0,0.0,0.0,0.0,0.896733,0.0,0.086371
