## Description

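In this notebook, I compare the segmentation accuracy of four approaches, obtained by combining two kinds of input signal (monoisotopic peak areas or regression-based deconvolution) with two clustering algorithms (K-means or spatialDGMM).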

## Data & module imports

In [2]:
# Select matplotlib's interactive notebook backend for all figures in this notebook.
%matplotlib notebook

In [3]:
import imageio
import numpy as np
import numpy.random as rd 
from masserstein import Spectrum, estimate_proportions
from matplotlib import cm
from matplotlib import pyplot as plt
from pyimzml.ImzMLWriter import ImzMLWriter
from pyimzml.ImzMLParser import ImzMLParser
from copy import deepcopy
from scipy.stats import nbinom, pearsonr
from sklearn.cluster import KMeans

Loading background image:

In [4]:
# Read the reference mask image; channel 1 of this array is used below to define the regions.
mask = imageio.imread('SimulationMask.png')
mask.shape  # last expression of the cell: displays the array shape

(40, 40, 4)

In [5]:
# Each region is the boolean map of pixels whose channel-1 value equals one specific level.
region1, region2, region3, region4 = (
    mask[:, :, 1] == level for level in (128, 46, 130, 53)
)

Names of lipids:

In [6]:
# Lipid labels; the position in this list is the channel index used for every
# per-lipid image in the notebook.
lipid_names = [
    'PC(38:1)',
    'PA(44:0)',
    'PC(38:0)',
]

Regions of monoisotopic peaks to integrate to get peak areas:

In [7]:
# (lower m/z, upper m/z) bounds of the window integrated for each lipid.
# Window i fills channel i of the peak-area image, i.e. it pairs with lipid_names[i].
integration_regions = [
    (854.4, 854.8),  # channel 0
    (855.4, 855.8),  # channel 1
    (856.4, 856.8)   # channel 2
]

Regions enriched in lipids:

In [8]:
# Ground-truth maps: one boolean channel per lipid, True where the lipid is enriched.
# Every lipid is enriched in the union of two of the mask regions.
enrichment_mask = np.zeros(mask.shape[:2] + (len(lipid_names),), dtype='bool')
enriched_region_pairs = [
    (region1, region3),
    (region2, region4),
    (region3, region4),
]
for channel, (first_region, second_region) in enumerate(enriched_region_pairs):
    enrichment_mask[..., channel] = np.logical_or(first_region, second_region)

Loading profile image:

In [9]:
# Open the imzML file; individual spectra are fetched from this parser pixel by pixel below.
MSI_profile = ImzMLParser('lipid_MSI_profile_mode.imzML')

  warn(


Calculating monoisotopic peak signals:

In [10]:
# Build one peak-area image per integration window by integrating every pixel's
# spectrum over that m/z window.
# The channel count follows integration_regions (it used to be a hard-coded 3), so
# adding or removing a window cannot silently mis-size the array.
monoisotopic_peak_areas = np.zeros(mask.shape[:-1] + (len(integration_regions),))
# NOTE(review): the parser coordinates are used directly as 0-based array indices —
# confirm that the imzML file was written with 0-based coordinates.
for idx, (xcoord, ycoord, zcoord) in enumerate(MSI_profile.coordinates):
    mz, intsy = MSI_profile.getspectrum(idx)
    mz, intsy = np.asarray(mz), np.asarray(intsy)
    for region_id, (lower_mz, upper_mz) in enumerate(integration_regions):
        # Boolean selection of the points inside the (inclusive) window.
        to_take = (mz >= lower_mz) & (mz <= upper_mz)
        # Trapezoidal integration of the intensity over the selected m/z points.
        area = np.trapz(intsy[to_take], mz[to_take])
        monoisotopic_peak_areas[ycoord, xcoord, region_id] = area

Loading deconvolution results:

In [11]:
# Read the regression output: tab-separated rows of x, y, followed by the per-channel
# values stored at that pixel.
deconvolution_image = np.zeros(mask.shape[:-1] + (3,))
with open('lipid_regression_results.tsv') as results_file:
    next(results_file)  # the first line is skipped (presumably a header)
    for record in results_file:
        fields = record.strip().split('\t')
        column, row = int(fields[0]), int(fields[1])
        deconvolution_image[row, column, ...] = [float(value) for value in fields[2:]]

Loading Dan's clustering results:

In [12]:
# Per-pixel cluster labels read from the externally computed K-means result file.
peak_kmeans_segments = np.zeros(mask.shape[:-1] + (3,))
with open('km_simulation.csv') as csv_file:
    next(csv_file)  # skip the header
    for row in csv_file:
        # The first column is dropped; the remaining ones are x, y, then the labels.
        x, y, *labels = (int(cell) for cell in row.split(',')[1:])
        peak_kmeans_segments[y, x, ...] = labels
peak_kmeans_segments -= 1  # to get a binary classification

In [13]:
# Per-pixel cluster labels read from the externally computed spatialDGMM result file
# (peak-based input).
peak_sdgmm_segments = np.zeros(mask.shape[:-1] + (3,))
with open('sdgmm_simulation.csv') as csv_file:
    next(csv_file)  # skip the header
    for row in csv_file:
        # The first column is dropped; the remaining ones are x, y, then the labels.
        values = [int(cell) for cell in row.split(',')[1:]]
        x, y = values[0], values[1]
        peak_sdgmm_segments[y, x, ...] = values[2:]
peak_sdgmm_segments -= 1   # to get a binary classification

In [25]:
# Per-pixel cluster labels read from the externally computed spatialDGMM result file
# (deconvolved-signal input, judging by the file name).
lipid_sdgmm_segments = np.zeros(mask.shape[:-1] + (3,))
with open('sdgmm_simulation_deconv_5r.csv') as csv_file:
    next(csv_file)  # skip the header
    for row in csv_file:
        # The first column is dropped; the remaining ones are x, y, then the labels.
        cells = row.split(',')[1:]
        x, y = int(cells[0]), int(cells[1])
        lipid_sdgmm_segments[y, x, ...] = [int(cell) for cell in cells[2:]]
lipid_sdgmm_segments -= 1   # to get a binary classification

Do a k-means clustering of peaks for data validation:

In [15]:
# Two-cluster K-means on each lipid's peak-area image, run locally as a sanity check.
# random_state and n_init are fixed so that re-running the notebook reproduces the same
# segmentation regardless of the scikit-learn version's defaults.
nclust = 2
peak_kmeans_segments_replicated = np.zeros(mask.shape[:-1] + (len(lipid_names),))
for lipid_id, lipid_name in enumerate(lipid_names):
    peak_intensities = monoisotopic_peak_areas[..., lipid_id]
    peak_kmeans = KMeans(n_clusters=nclust, n_init=10, random_state=0)
    # KMeans expects a (n_samples, n_features) array: one sample per pixel, one feature.
    region_classification = peak_kmeans.fit_predict(peak_intensities.reshape((-1, 1)))
    # argsort of argsort gives every cluster the rank of its center, so label 0 is
    # always the lower-intensity cluster and label 1 the higher-intensity one.
    center_rank = np.argsort(np.argsort(peak_kmeans.cluster_centers_.reshape((-1,))))
    # Reshape to the image's own shape instead of a hard-coded (40, 40).
    region_classification = center_rank[region_classification].reshape(peak_intensities.shape)
    peak_kmeans_segments_replicated[..., lipid_id] = region_classification

Replicate a k-means clustering of lipid signals:

In [16]:
# Two-cluster K-means on each lipid's deconvolved signal image.
# random_state and n_init are fixed so that the segmentation (and the accuracy figures
# computed from it further down) are reproducible across runs and scikit-learn versions.
nclust = 2
signal_kmeans_segments_replicated = np.zeros(mask.shape[:-1] + (len(lipid_names),))
for lipid_id, lipid_name in enumerate(lipid_names):
    signal_intensities = deconvolution_image[..., lipid_id]
    signal_kmeans = KMeans(n_clusters=nclust, n_init=10, random_state=0)
    # KMeans expects a (n_samples, n_features) array: one sample per pixel, one feature.
    region_classification = signal_kmeans.fit_predict(signal_intensities.reshape((-1, 1)))
    # argsort of argsort gives every cluster the rank of its center, so label 0 is
    # always the lower-signal cluster and label 1 the higher-signal one.
    center_rank = np.argsort(np.argsort(signal_kmeans.cluster_centers_.reshape((-1,))))
    # Reshape to the image's own shape instead of a hard-coded (40, 40).
    region_classification = center_rank[region_classification].reshape(signal_intensities.shape)
    signal_kmeans_segments_replicated[..., lipid_id] = region_classification

## Data visualization

In [17]:
# Show the mask that defines the simulated regions.
reference_fig, reference_ax = plt.subplots(figsize=(3, 3))
reference_ax.set_title('The reference image')
reference_ax.imshow(mask)
reference_fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

Ion images, monoisotopic peak

In [19]:
# One panel per lipid: area of the monoisotopic peak at every pixel.
figure, panels = plt.subplots(1, 3, figsize=(6, 2))
for panel, (lipid_id, lipid_name) in zip(panels, enumerate(lipid_names)):
    panel.set_title(lipid_name + ' mono')
    image = panel.imshow(monoisotopic_peak_areas[:, :, lipid_id])
    panel.axis('off')
    figure.colorbar(image, ax=panel)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

Ion images, deconvoluted

In [20]:
# One panel per lipid: deconvolved signal at every pixel.
figure, panels = plt.subplots(1, 3, figsize=(6, 2))
for panel, (lipid_id, lipid_name) in zip(panels, enumerate(lipid_names)):
    panel.set_title(lipid_name + ' deconv')
    image = panel.imshow(deconvolution_image[:, :, lipid_id])
    panel.axis('off')
    figure.colorbar(image, ax=panel)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

K-means clustering (Dan), monoisotopic peak

In [21]:
# One panel per lipid: K-means labels loaded from file (peak-based input).
figure, panels = plt.subplots(1, 3, figsize=(6, 2))
for lipid_id, panel in enumerate(panels):
    panel.set_title(lipid_names[lipid_id] + ' mono + KM')
    image = panel.imshow(peak_kmeans_segments[:, :, lipid_id])
    panel.axis('off')
    figure.colorbar(image, ax=panel)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

Local K-means to check data consistency:

In [22]:
# One panel per lipid: K-means labels computed locally in this notebook (peak-based input).
figure, panels = plt.subplots(1, 3, figsize=(6, 2))
for lipid_id, panel in enumerate(panels):
    panel.set_title(lipid_names[lipid_id] + ' mono + KM local')
    image = panel.imshow(peak_kmeans_segments_replicated[:, :, lipid_id])
    panel.axis('off')
    figure.colorbar(image, ax=panel)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

Small differences are expected (K-means probably converged to a different local minimum); overall, the two segmentations are consistent.

spatialDGMM clustering (Dan), mono

In [23]:
# One panel per lipid: spatialDGMM labels loaded from file (peak-based input).
figure, panels = plt.subplots(1, 3, figsize=(6, 2))
for panel, (lipid_id, lipid_name) in zip(panels, enumerate(lipid_names)):
    panel.set_title(lipid_name + ' mono + sdgmm')
    image = panel.imshow(peak_sdgmm_segments[:, :, lipid_id])
    panel.axis('off')
    figure.colorbar(image, ax=panel)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

spatialDGMM clustering (Dan), deconv (?)

In [26]:
# One panel per lipid: spatialDGMM labels loaded from the deconvolution-based result file.
figure, panels = plt.subplots(1, 3, figsize=(6, 2))
for panel, (lipid_id, lipid_name) in zip(panels, enumerate(lipid_names)):
    panel.set_title(lipid_name + ' deconv + sdgmm')
    image = panel.imshow(lipid_sdgmm_segments[:, :, lipid_id])
    panel.axis('off')
    figure.colorbar(image, ax=panel)
figure.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

## Accuracy computation

In [27]:
nclust = 2

n_lipids = len(lipid_names)

# One score per lipid for every (input signal, algorithm) combination.
peak_kmeans_accuracy = [0.] * n_lipids
ws_kmeans_accuracy = [0.] * n_lipids
peak_sdgmm_accuracy = [0.] * n_lipids
ws_sdgmm_accuracy = [0.] * n_lipids

peak_kmeans_correlation = [0.] * n_lipids
ws_kmeans_correlation = [0.] * n_lipids
peak_sdgmm_correlation = [0.] * n_lipids
ws_sdgmm_correlation = [0.] * n_lipids

# (segmentation, accuracy list to fill, correlation list to fill), evaluated in this order:
# peak K-means, peak spatialDGMM, WS K-means, WS spatialDGMM.
evaluated_segmentations = (
    (peak_kmeans_segments, peak_kmeans_accuracy, peak_kmeans_correlation),
    (peak_sdgmm_segments, peak_sdgmm_accuracy, peak_sdgmm_correlation),
    (signal_kmeans_segments_replicated, ws_kmeans_accuracy, ws_kmeans_correlation),
    (lipid_sdgmm_segments, ws_sdgmm_accuracy, ws_sdgmm_correlation),
)

for lipid_id, lipid_name in enumerate(lipid_names):
    truth = enrichment_mask[..., lipid_id]
    for segments, accuracy_scores, correlation_scores in evaluated_segmentations:
        region_classification = segments[..., lipid_id]
        # "Accuracy" here is sum(label * truth) / sum(truth): for 0/1 labels this is the
        # fraction of truly enriched pixels that received label 1. Pixels labelled 1
        # outside the enriched area do not lower the score.
        accuracy_scores[lipid_id] = np.sum(region_classification * truth) / np.sum(truth)
        # Pearson correlation between the label image and the ground-truth mask.
        correlation_scores[lipid_id] = pearsonr(region_classification.flatten(), truth.flatten())[0]


In [28]:
# Print two tab-separated tables: raw accuracy values, then correlations rounded to 2 decimals.
table_header = 'Lipid\tAlgorithm\tPeak\tWS'

print('Accuracy of segmentation:')
print(table_header)
accuracy_rows = (
    ('K-means', peak_kmeans_accuracy, ws_kmeans_accuracy),
    ('sDGMM', peak_sdgmm_accuracy, ws_sdgmm_accuracy),
)
for algorithm, peak_scores, ws_scores in accuracy_rows:
    for name, peak_score, ws_score in zip(lipid_names, peak_scores, ws_scores):
        print(name, algorithm, peak_score, ws_score, sep='\t')
print()
print('Correlation of segmentation:')
print(table_header)
correlation_rows = (
    ('K-means', peak_kmeans_correlation, ws_kmeans_correlation),
    ('sDGMM', peak_sdgmm_correlation, ws_sdgmm_correlation),
)
for algorithm, peak_scores, ws_scores in correlation_rows:
    for name, peak_score, ws_score in zip(lipid_names, peak_scores, ws_scores):
        print(name, algorithm, round(peak_score, 2), round(ws_score, 2), sep='\t')

Accuracy of segmentation:
Lipid	Algorithm	Peak	WS
PC(38:1)	K-means	0.79625	0.7975
PA(44:0)	K-means	0.0975	0.56375
PC(38:0)	K-means	0.645	0.6775
PC(38:1)	sDGMM	0.9975	0.99875
PA(44:0)	sDGMM	0.045	0.98875
PC(38:0)	sDGMM	0.5775	0.985

Correlation of segmentation:
Lipid	Algorithm	Peak	WS
PC(38:1)	K-means	0.81	0.81
PA(44:0)	K-means	-0.53	0.54
PC(38:0)	K-means	0.3	0.58
PC(38:1)	sDGMM	1.0	1.0
PA(44:0)	sDGMM	-0.62	0.99
PC(38:0)	sDGMM	0.34	0.94


## Joint visualization for the article

Ion images:

In [40]:
# Article figure: top row shows the monoisotopic peak areas, bottom row the deconvolved
# signal, one column per lipid.
fig, axes = plt.subplots(2, 3, figsize=(6, 3))
for lipid_id, lipid_name in enumerate(lipid_names):
    peak_ax, deconv_ax = axes[0, lipid_id], axes[1, lipid_id]

    peak_ax.set_title(lipid_name)
    peak_image = peak_ax.imshow(monoisotopic_peak_areas[:, :, lipid_id], cmap=cm.inferno, vmin=0)
    peak_ax.axis('off')
    fig.colorbar(peak_image, ax=peak_ax)

    deconv_image = deconv_ax.imshow(deconvolution_image[:, :, lipid_id], cmap=cm.inferno, vmin=0)
    deconv_ax.axis('off')
    fig.colorbar(deconv_image, ax=deconv_ax)
fig.tight_layout()
# Save before showing and through the figure object: on backends that close the figure
# in plt.show(), a later plt.savefig() would write an empty image.
fig.savefig('ion_images.png', dpi=600)
plt.show()

<IPython.core.display.Javascript object>

Segmentation:

In [29]:
# Article figure: one column per lipid, one row per segmentation, top to bottom:
# peak K-means, peak spatialDGMM, deconvolved-signal K-means, deconvolved-signal spatialDGMM.
segmentation_rows = (
    peak_kmeans_segments,
    peak_sdgmm_segments,
    signal_kmeans_segments_replicated,
    lipid_sdgmm_segments,
)
fig, axes = plt.subplots(4, 3, figsize=(4, 6))
for lipid_id, lipid_name in enumerate(lipid_names):
    # Only the top panel of each column carries the lipid name.
    axes[0, lipid_id].set_title(lipid_name, fontname='Arial')
    for row_id, segments in enumerate(segmentation_rows):
        axes[row_id, lipid_id].imshow(segments[:, :, lipid_id])
        axes[row_id, lipid_id].axis('off')
fig.tight_layout()
# Save before showing and through the figure object: on backends that close the figure
# in plt.show(), a later plt.savefig() would write an empty image.
fig.savefig('segmentation_images.png', dpi=600)
plt.show()

<IPython.core.display.Javascript object>