## Description

In this notebook, we analyze the segmentation results.  

In [1]:
%matplotlib notebook

In [2]:
from masserstein import Spectrum, estimate_proportions
from matplotlib import cm
from matplotlib import pyplot as plt
import numpy as np
from pyimzml.ImzMLParser import ImzMLParser
from pyimzml.ImzMLWriter import ImzMLWriter
from time import time
import pandas as pd
from sklearn.cluster import KMeans
from additional_functions import profile_ion_image, centroided_ion_image, segment_average_spectra, total_spectrum

In [3]:
import numpy.random as rd

In [4]:
analyzed_mass_range = (600, 1000)

In [5]:
cerebellum_image_shape = (21, 81)
bladder_image_shape = (134, 260)

In [6]:
bladder_centroided_image = ImzMLParser('MSimages/bladder_centroided.imzML')
cerebellum_centroided_image = ImzMLParser('MSimages/cerebellum_centroided.imzML')

In [7]:
bladder_profile_image = ImzMLParser('MSimages/HR2MSI mouse urinary bladder S096.imzML')
cerebellum_profile_image = ImzMLParser('MSimages/test_POS.imzML')

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [8]:
centroided_images = {'cerebellum': cerebellum_centroided_image, 
                     'bladder': bladder_centroided_image}
profile_images = {'cerebellum': cerebellum_profile_image, 
                  'bladder': bladder_profile_image}
image_shapes = {'cerebellum': cerebellum_image_shape, 
                'bladder': bladder_image_shape}
image_names = ['cerebellum', 'bladder']

In [9]:
lipid_formulas = {n : [] for n in image_names}
for i, n in enumerate(image_names):
    with open(n + '_annotation_results.tsv') as h:
        next(h)  # skipping header
        for l in h:
            l = l.strip().split('\t')
            lipid_formulas[n].append(l[0])
    assert len(lipid_formulas[n]) == len(set(lipid_formulas[n])), 'Formulas for %s are not unique!' % n      
    print('Loaded %i formulas for %s' % (len(lipid_formulas[n]), n))

Loaded 207 formulas for cerebellum
Loaded 180 formulas for bladder


In [10]:
lipid_spectra = {n: [Spectrum(f, adduct='K', threshold=0.05) for f in lipid_formulas[n]] for n in image_names}
for n in image_names:
    for s in lipid_spectra[n]:
        s.normalize()
        assert analyzed_mass_range[0] <= s.confs[0][0] <= analyzed_mass_range[1]

Load the quantification results. We use 3D arrays to store the data. The first two dimentions denote spatial coordinates, the third one is the lipid ID.  
We will use the quantification results to filter lipids which are found in only a handful of pixels.

In [11]:
masserstein_images = {n : np.load(n+'_regression_images.dmp', allow_pickle=True) for n in image_names}
[masserstein_images[n].shape for n in image_names]

[(21, 81, 207), (134, 260, 180)]

In [12]:
nb_of_nonzero_pixels = {n: np.sum(masserstein_images[n]>0, axis=(0,1)) for n in image_names}
is_in_enough_pixels = {n: nb_of_nonzero_pixels[n] >= 400 for n in image_names}

In [13]:
for n in image_names:
    masserstein_images[n] = masserstein_images[n][..., is_in_enough_pixels[n]]
    # single_peak_intensities[n] = single_peak_intensities[n][..., is_in_enough_pixels[n]]
    lipid_formulas[n] = [f for f,t in zip(lipid_formulas[n], is_in_enough_pixels[n]) if t]
    lipid_spectra[n] = [f for f,t in zip(lipid_spectra[n], is_in_enough_pixels[n]) if t]
[masserstein_images[n].shape for n in image_names]

[(21, 81, 74), (134, 260, 52)]

Load the segmentation results. We use 3D arrays to store the data. The first two dimentions denote spatial coordinates, the third one is the ID of the cluster. 

In [14]:
single_peak_kmeans_segmentation = {n : np.load(n+'_peak_kmeans_segmentation.dmp', allow_pickle=True) for n in image_names}
[single_peak_kmeans_segmentation[n].shape for n in image_names]

[(21, 81, 74), (134, 260, 52)]

In [15]:
masserstein_kmeans_segmentation = {n : np.load(n+'_masserstein_kmeans_segmentation.dmp', allow_pickle=True) for n in image_names}
[masserstein_kmeans_segmentation[n].shape for n in image_names]

[(21, 81, 74), (134, 260, 52)]

In [16]:
masserstein_sdgmm_segmentation = {n : np.load(n+'_masserstein_sdgmm_segmentation.dmp', allow_pickle=True) for n in image_names}
[masserstein_sdgmm_segmentation[n].shape for n in image_names]

[(21, 81, 74), (134, 260, 52)]

Verify proper loading:

In [17]:
plt.figure()
plt.subplot(221)
plt.title('Masserstein+Kmeans')
plt.imshow(masserstein_kmeans_segmentation['bladder'][..., 34])
plt.axis('off')
plt.subplot(222)
plt.title('Masserstein+sDGMM')
plt.imshow(masserstein_sdgmm_segmentation['bladder'][..., 34])
plt.axis('off')
plt.subplot(223)
plt.imshow(masserstein_kmeans_segmentation['cerebellum'][..., 2])
plt.axis('off')
plt.subplot(224)
plt.imshow(masserstein_sdgmm_segmentation['cerebellum'][..., 2])
plt.axis('off')
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [18]:
lipid_spectra['cerebellum'][0].formula

'C43H78NO8P'

## Correlation between segmentation techniques

In [19]:
segmentation_agreement_percent = {n: np.zeros(len(lipid_formulas[n])) for n in image_names}
segmentation_correlation = {n: np.zeros(len(lipid_formulas[n])) for n in image_names}
for n in image_names:
    segmentation_agreement_percent[n] = np.mean(masserstein_sdgmm_segmentation[n] == masserstein_kmeans_segmentation[n], 
                                            axis=(0,1)) 
    mass_standard = masserstein_sdgmm_segmentation[n] - np.mean(masserstein_sdgmm_segmentation[n], axis=(0,1))
    mass_standard = mass_standard / np.std(mass_standard, axis=(0,1))
    kmean_standard = masserstein_kmeans_segmentation[n] - np.mean(masserstein_kmeans_segmentation[n], axis=(0,1))
    kmean_standard = kmean_standard / np.std(kmean_standard, axis=(0,1))
    segmentation_correlation[n] = np.mean(mass_standard*kmean_standard, axis=(0,1))

In [20]:
print('Average identity for cerebellum:', np.mean(segmentation_agreement_percent['cerebellum']))
print('Average identity for bladder:', np.mean(segmentation_agreement_percent['bladder']))
plt.figure(figsize=(8,4), dpi=100)
plt.subplot(121)
plt.title('Cerebellum')
plt.xlim(0,1)
plt.xlabel('Segmentation agreement')
plt.ylabel('Number of lipids')
plt.hist(segmentation_agreement_percent['cerebellum'], bins=np.linspace(0,1,num=21))
plt.subplot(122)
plt.title('Bladder')
plt.xlim(0,1)
plt.xlabel('Segmentation agreement')
plt.hist(segmentation_agreement_percent['bladder'], bins=np.linspace(0,1,num=21))
plt.tight_layout()
plt.savefig('Figs/segmentation/percent_agreement_hist.png', dpi=300)

Average identity for cerebellum: 0.9590145701256813
Average identity for bladder: 0.979834187052901


<IPython.core.display.Javascript object>

In [21]:
print(np.sum(segmentation_agreement_percent['cerebellum']>0.9))
print(np.sum(segmentation_agreement_percent['bladder']>0.9))

73
52


Check the lipids for which segmentation differs a lot:

In [22]:
cerebellum_test =np.argmin(segmentation_agreement_percent['cerebellum'])
plt.figure()
plt.subplot(311)
plt.title('Masserstein')
plt.imshow(masserstein_images['cerebellum'][..., cerebellum_test])
plt.subplot(312)
plt.title('Masserstein+Kmeans')
plt.imshow(masserstein_kmeans_segmentation['cerebellum'][..., cerebellum_test])
plt.axis('off')
plt.subplot(313)
plt.title('Masserstein+sDGMM')
plt.imshow(masserstein_sdgmm_segmentation['cerebellum'][..., cerebellum_test])
plt.axis('off')
plt.tight_layout()

<IPython.core.display.Javascript object>

In [23]:
print('Correlation for cerebellum:', np.mean(segmentation_correlation['cerebellum']))
print('Correlation for bladder:', np.mean(segmentation_correlation['bladder']))
plt.figure(figsize=(8,4), dpi=100)
plt.subplot(121)
plt.title('Cerebellum')
plt.xlim(0,1)
plt.xlabel('Segmentation agreement')
plt.ylabel('Number of lipids')
plt.hist(segmentation_correlation['cerebellum'])
plt.subplot(122)
plt.title('Bladder')
plt.xlim(0,1)
plt.xlabel('Segmentation agreement')
plt.hist(segmentation_correlation['bladder'])
plt.tight_layout()

Correlation for cerebellum: 0.9041521786797186
Correlation for bladder: 0.8971985841222412


<IPython.core.display.Javascript object>

In [24]:
np.argsort(segmentation_agreement_percent['bladder'])

array([27, 25, 47,  9, 13, 18, 30, 14,  5, 50, 42, 17, 36, 22, 37, 20, 43,
       11,  2,  7, 34, 31,  0, 35, 21, 23, 26, 24, 49, 41, 32, 10, 12, 45,
       46, 48, 44,  3, 40, 19,  6, 39, 28, 51, 16,  8, 38, 15, 33,  1, 29,
        4])

In [25]:
plt.figure(figsize=(8,4), dpi=100)
plt.subplot(121)
plt.title('Cerebellum')
plt.xlim(0,1)
plt.xlabel('Segmentation difference')
plt.ylabel('Number of lipids')
plt.hist(1-segmentation_agreement_percent['cerebellum'], bins=60)
plt.subplot(122)
plt.title('Bladder')
plt.xlim(0,1)
plt.xlabel('Segmentation difference')
plt.hist(1-segmentation_agreement_percent['bladder'], bins=60)
plt.tight_layout()

<IPython.core.display.Javascript object>

In [27]:
print(np.mean(1-segmentation_agreement_percent['cerebellum']))
print(np.quantile(1-segmentation_agreement_percent['bladder'], 0.9))

0.04098542987431877
0.061142365097588955


In [None]:
cer_diff = 1-segmentation_agreement_percent['cerebellum']
bld_diff = 1-segmentation_agreement_percent['bladder']

## Spatial consistency 

In [32]:
sdgmm_consistency_scores = {n: np.zeros(len(lipid_formulas[n])) for n in image_names}
kmeans_consistency_scores = {n: np.zeros(len(lipid_formulas[n])) for n in image_names}
for n in image_names:
    t = masserstein_sdgmm_segmentation[n]
    r = masserstein_kmeans_segmentation[n]
    for sh1 in [-1, 0, 1]:
        for sh2 in [-1, 0, 1]:
            if sh1 == 0 and sh2 == 0: continue
            sdgmm_consistency_scores[n] += np.sum(t == np.roll(t, (sh1, sh2), axis=(0, 1)), axis=(0, 1))
            kmeans_consistency_scores[n] += np.sum(r == np.roll(r, (sh1, sh2), axis=(0, 1)), axis=(0, 1))
    sdgmm_consistency_scores[n] /= (8*image_shapes[n][0]*image_shapes[n][1])
    kmeans_consistency_scores[n] /= (8*image_shapes[n][0]*image_shapes[n][1])

In [33]:
sdgmm_consistency_scores[n]

array([0.90186567, 0.97439724, 0.85228186, 0.95203789, 0.97887486,
       0.92128301, 0.94147532, 0.83464409, 0.96123708, 0.85549656,
       0.91640356, 0.8401837 , 0.97864524, 0.92946326, 0.8852899 ,
       0.92946326, 0.94925373, 0.84922503, 0.85828071, 0.95638634,
       0.91650402, 0.95624282, 0.85625718, 0.86950344, 0.92402411,
       0.86288749, 0.93206085, 0.9195465 , 0.96195465, 0.97590413,
       0.84031286, 0.94553674, 0.9060132 , 0.96206946, 0.93843284,
       0.84744546, 0.88935132, 0.9119977 , 0.9565729 , 0.96740815,
       0.9587256 , 0.91555683, 0.85133467, 0.84839265, 0.93920781,
       0.97204363, 0.96601607, 0.87139782, 0.9494403 , 0.88349598,
       0.85519518, 0.96384902])

In [34]:
kmeans_consistency_scores[n]

array([0.90924225, 0.97547359, 0.87045063, 0.95799369, 0.97937715,
       0.93132893, 0.94437428, 0.84631171, 0.96504018, 0.81514064,
       0.92656429, 0.85503731, 0.98904994, 0.87191447, 0.90605626,
       0.93252009, 0.95251148, 0.87751148, 0.82864524, 0.95914179,
       0.9439868 , 0.97604765, 0.88085534, 0.88055396, 0.93170207,
       0.79867968, 0.93376866, 0.85806544, 0.9668054 , 0.97709529,
       0.83958094, 0.95044489, 0.91400689, 0.96608783, 0.95723307,
       0.8571039 , 0.90100459, 0.90140643, 0.9602899 , 0.97291906,
       0.96178243, 0.92355052, 0.89187715, 0.86095006, 0.94163318,
       0.98105626, 0.97507176, 0.83828932, 0.95397532, 0.88976751,
       0.88669633, 0.96640356])

Check the percentage improvement of the consistency:

In [36]:
100*(sdgmm_consistency_scores[n]/kmeans_consistency_scores[n] - 1)

array([-0.81128859, -0.11034117, -2.0872834 , -0.62169491, -0.05128731,
       -1.07866554, -0.30697222, -1.37864374, -0.3940872 ,  4.95079139,
       -1.09660332, -1.73718928, -1.05199008,  6.60027981, -2.29195045,
       -0.32780326, -0.34201684, -3.22348516,  3.57637686, -0.28728323,
       -2.91135199, -2.02908353, -2.79253153, -1.25495054, -0.8240785 ,
        8.03924387, -0.18289403,  7.16507777, -0.50172933, -0.12190823,
        0.08717651, -0.5164057 , -0.87457606, -0.41594248, -1.96401799,
       -1.12686905, -1.29336434,  1.17497214, -0.38706978, -0.56643016,
       -0.3178298 , -0.86553851, -4.54574711, -1.45855212, -0.25757091,
       -0.91866589, -0.92871966,  3.94953092, -0.47538098, -0.70485008,
       -3.55264223, -0.26433419])

In [37]:
n='cerebellum'

In [38]:
plt.figure()
plt.subplot(211)
plt.xlim(0, 1)
plt.hist(kmeans_consistency_scores[n])
plt.subplot(212)
plt.hist(sdgmm_consistency_scores[n])
plt.xlim(0,1)

<IPython.core.display.Javascript object>

(0.0, 1.0)

## Generating summary plots

Generate and save images with full data on segments: 

In [32]:
for image_id, n in enumerate(image_names):
    print(n)
    full_mass_axis = np.arange(600, 1000, 0.001)
    image_total_spectrum = total_spectrum(profile_images[n], full_mass_axis)
    image_total_spectrum = np.array(image_total_spectrum)
    for lipid_id, lipid_sp in enumerate(lipid_spectra[n][8:18]):
        lipid_id += 8
        print(lipid_id)
        # split peak intensities and masserstein estimates per segment
        single_peak_in_clusters = []
        masserstein_in_clusters = []
        for segment_id in range(1, 3):  # skipping the background cluster
            kmeans_segment_mask = single_peak_kmeans_segmentation[n][..., lipid_id]==segment_id
            sdgmm_segment_mask = masserstein_sdgmm_segmentation[n][..., lipid_id]==segment_id
            single_peak_in_clusters.append(intensity_images[n][kmeans_segment_mask, lipid_id])
            masserstein_in_clusters.append(masserstein_images[n][sdgmm_segment_mask, lipid_id])
        
        nbh_lbound = lipid_sp.confs[0][0] - 1.2
        nbh_ubound = lipid_sp.confs[0][0] + 2.2
        in_nbh = (full_mass_axis >= nbh_lbound)*(full_mass_axis<=nbh_ubound)
        local_mass_axis = full_mass_axis[in_nbh]
        local_total = image_total_spectrum[in_nbh]
        monoisotopic_total = local_total[np.searchsorted(local_mass_axis, lipid_sp.confs[0][0])]
        
        intensity_per_segment = []
        masserstein_per_segment = []
        for segment_id in range(3):
            intensity_per_segment.append(total_spectrum(profile_images[n], 
                                                        local_mass_axis, 
                                                        single_peak_kmeans_segmentation[n][..., lipid_id] == segment_id))
            masserstein_per_segment.append(total_spectrum(profile_images[n], 
                                                        local_mass_axis, 
                                                        masserstein_sdgmm_segmentation[n][..., lipid_id] == segment_id))
        intensity_per_segment = np.array(intensity_per_segment)
        masserstein_per_segment = np.array(masserstein_per_segment)
        #assert np.all(np.sum(intensity_per_segment, axis=0) == total)
        #assert np.all(np.sum(masserstein_per_segment, axis=0) == total)
        # intensity_cluster_spectra = segment_average_spectra(profile_images[n], single_peak_kmeans_segmentation[n][..., lipid_id], mass_axis)
        # masserstein_cluster_spectra = segment_average_spectra(profile_images[n], masserstein_sdgmm_segmentation[n][..., lipid_id], mass_axis)
        plt.figure(figsize=(7.6, 10), dpi=300)
        
        plt.subplot(4, 2, 1)
        plt.title('Peak (profile)\n' + str(round(lipid_sp.confs[0][0], 3)))
        plt.imshow(intensity_images[n][..., lipid_id])
        plt.axis('off')
        
        plt.subplot(4, 2, 2)
        plt.title('Masserstein\n' + lipid_sp.formula)
        plt.imshow(masserstein_images[n][..., lipid_id])
        plt.axis('off')
        
        plt.subplot(4, 2, 3)
        plt.title('K-means')
        plt.imshow(single_peak_kmeans_segmentation[n][..., lipid_id])
        plt.axis('off')
        
        plt.subplot(4, 2, 4)
        plt.title('sDGMM')
        plt.imshow(masserstein_sdgmm_segmentation[n][..., lipid_id])
        plt.axis('off')
        
        plt.subplot(4, 2, 5)
        plt.title('Peak intensity\nper cluster')
        plt.hist(single_peak_in_clusters, stacked=True, bins=80)
        
        plt.subplot(4, 2, 6)
        plt.title('Estimated signal\nper cluster')
        plt.hist(masserstein_in_clusters, stacked=True, bins=80)
        
        plt.subplot(4, 2, 7)
        plt.title('AV')
        plt.plot(local_mass_axis, local_total, lw=0.1, color='k', alpha=0.5)
        plt.annotate('', 
                     xy=(lipid_sp.confs[0][0], monoisotopic_total), 
                     xytext=(lipid_sp.confs[0][0]+0.5, monoisotopic_total+0.5),
                     arrowprops=dict(facecolor='black', width=2))
        for segment_id in range(3):
            plt.fill_between(local_mass_axis, 
                             np.sum(intensity_per_segment[:segment_id,], axis=0),
                             np.sum(intensity_per_segment[:(segment_id+1), ], axis=0),
                             alpha=0.8)
        
        plt.legend(['Total'] + ['Cluster %i' % cl_id for cl_id in range(3)])
        plt.xlim(nbh_lbound, nbh_ubound)
        
        plt.subplot(4, 2, 8)
        plt.plot(local_mass_axis, local_total, lw=0.1, color='k', alpha=0.5)
        (lipid_sp*(monoisotopic_total/lipid_sp.confs[0][1])).plot(show=False, lw=0.5, color='k', alpha=0.5)
        for segment_id in range(3):
            plt.fill_between(local_mass_axis, 
                             np.sum(masserstein_per_segment[:segment_id,], axis=0),
                             np.sum(masserstein_per_segment[:(segment_id+1), ], axis=0),
                             alpha=0.8)
        plt.legend(['Total'] + ['Cluster %i' % cl_id for cl_id in range(3)])
        plt.xlim(nbh_lbound, nbh_ubound)

        plt.tight_layout()
        plt.savefig('Figs/' + n + '_combined_results/%i.png' % lipid_id, dpi=600)
        plt.close()

cerebellum
8
9
10
11
12
13
14
15
16
17
bladder
8
9
10
11
12
13
14
15
16
17
