In [12]:
%matplotlib inline

# path hack for relative import in jupyter notebook
import os
import sys

# LIBRARY GLOBAL MODS
# Put the parent of the current working directory on sys.path so the project
# packages imported below (utils, multicell, singlecell) resolve.
CELLTYPES = os.path.dirname(os.path.abspath(''))
# Guard the append: re-running this cell must not pile up duplicate entries.
if CELLTYPES not in sys.path:
    sys.path.append(CELLTYPES)

In [13]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pandas as pd
import umap
import time

# Notebook-wide seaborn defaults: 'white' style, 'notebook' context, 14x10 figure size.
sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

from utils.file_io import RUNS_FOLDER

# Output location used by the plotting cells: <RUNS_FOLDER>/explore/umap
outdir = os.sep.join([RUNS_FOLDER, 'explore', 'umap'])

## Local utils

In [14]:
from multicell.unsupervised_helper import \
    plotly_express_embedding, generate_control_data, plot_given_multicell, make_dimreduce_object

from singlecell.singlecell_linalg import sorted_eig

## Single umap

In [15]:
# Default settings for the dimension reducers used in this notebook.
REDUCER_SEED = 0
REDUCER_COMPONENTS = 3
VALID_REDUCERS = ['umap', 'tsne', 'pca']
REDUCERS_TO_USE = ['umap']
assert REDUCERS_TO_USE == ['umap']  # for now, extend later

# UMAP parameter reference: https://umap-learn.readthedocs.io/en/latest/api.html
UMAP_KWARGS = dict(
    random_state=REDUCER_SEED,
    n_components=REDUCER_COMPONENTS,
    metric='euclidean',
    init='spectral',
    unique=False,
    n_neighbors=15,
    min_dist=0.1,
    spread=1.0,
)
TSNE_KWARGS = dict(
    random_state=REDUCER_SEED,
    n_components=REDUCER_COMPONENTS,
    metric='euclidean',
    init='random',
    perplexity=30.0,
)
PCA_KWARGS = dict(n_components=REDUCER_COMPONENTS)

### 0) Dataset path

In [119]:
# 0) select the "manyruns" dataset to analyse
gamma = 1.0
# Alternative datasets previously used here:
#manyruns_dirname = 'Wmaze15_gamma%.2f_10k_p3_M100' % gamma
#manyruns_dirname = 'Wrandom0_gamma%.2f_10k_periodic_fixedorderV3_p3_M100' % gamma
manyruns_dirname = 'Wvary_s0randomInit_gamma%.2f_10k_periodic_fixedorderV3_p3_M100' % gamma
manyruns_path = os.sep.join([RUNS_FOLDER, 'multicell_manyruns', manyruns_dirname])

# Step 0) describe the dataset (label + folder); later cells add results to this dict
data_subdict = dict(label=manyruns_dirname, path=manyruns_path)

### 1) Dimension reduction (store in data_subdict object)

In [None]:
use_01 = True
nsubsample = None

# 1) fill out data_subdict (dim reduce)
# Gather the options first so the call itself stays short.
dimreduce_options = dict(
    nsubsample=nsubsample,
    flag_control=False,
    use_01=use_01,
    jitter_scale=0.0,
    reducers=REDUCERS_TO_USE,
    umap_kwargs=UMAP_KWARGS,
    tsne_kwargs=TSNE_KWARGS,
    pca_kwargs=PCA_KWARGS,
    step=None,
)
data_subdict = make_dimreduce_object(data_subdict, **dimreduce_options)
#save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_s0randomInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(10000, 900)


In [None]:
# Inspect the reduction result: top-level keys, keys of the UMAP entry, embedding shape.
print(data_subdict.keys())
umap_entry = data_subdict['algos']['umap']
print(umap_entry.keys())
print(umap_entry['embedding'].shape)

### 2) Visualize umap

In [None]:
# 2) visualize data_subdict
embedding_plot_options = dict(
    color_by_index=False,
    as_landscape=False,
    fmod='jupyter',
    show=False,
    dirpath=outdir,
    surf=False,
    step=None,
)
plotly_express_embedding(data_subdict, **embedding_plot_options)

# Clustering and plotting of sample points

In [None]:
from sklearn.cluster import KMeans

"""
NOTES:
- kmeans.labels_ will return [cluster_7, cluster_5, ..., cluster_0] -- cluster id for each data point
- also have: kmeans.predict([[0, 0], [12, 3]])
- also have: kmeans.cluster_centers_
"""

#kmeans_highdim = KMeans(n_clusters=8, random_state=0).fit(X)
N_CLUSTERS = 30
# Placeholder palette: every cluster currently maps to 'blue'.
CLUSTER_COLOURS = {i: 'blue' for i in range(N_CLUSTERS)}  # TODO implement, currently unused
assert N_CLUSTERS <= len(CLUSTER_COLOURS)

# Cluster the low-dimensional UMAP embedding.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4, which would otherwise change the clustering between versions.
kmeans_lowdim = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0).fit(
    data_subdict['algos']['umap']['embedding'])

def cluster_to_colour(a):
    """Look up the colour assigned to cluster id ``a`` in ``CLUSTER_COLOURS``.

    Raises ``KeyError`` if ``a`` has no entry in the mapping.
    """
    colour = CLUSTER_COLOURS[a]
    return colour

In [None]:
# Per-point cluster assignment from the fitted k-means model.
kmeans_labels = kmeans_lowdim.labels_
color_vector = list(map(cluster_to_colour, kmeans_labels))  # TODO currently unused

# Bundle of cluster information handed to the plotting helper via `clusterstyle`.
clusterdata = dict(
    color_vector=color_vector,
    cluster_ids=kmeans_labels,
    order=list(map(str, range(N_CLUSTERS))),
    # cluster id -> array of positions in kmeans_labels carrying that id
    cluster_to_idx={
        cluster_id: np.where(kmeans_labels == cluster_id)[0]
        for cluster_id in range(N_CLUSTERS)
    },
)

## Embedding with colour by cluster

In [None]:
# Same embedding plot as before, now passing the cluster data via `clusterstyle`.
cluster_plot_options = dict(
    color_by_index=False,
    as_landscape=False,
    fmod='jupyter',
    show=False,
    dirpath=outdir,
    surf=False,
    step=None,
    clusterstyle=clusterdata,
)
plotly_express_embedding(data_subdict, **cluster_plot_options)

## Plot individual points from the clustering

In [None]:
from multicell.multicell_replot import \
    replot_graph_lattice_reference_overlap_plotter, replot_modern, replot_scatter_dots

def plot_tissue_given_agg_idx(agg_index, fmod, outdir, state_int=False):
    """Replot the stored final state of one run from the manyruns aggregate.

    Reads from the notebook-level ``manyruns_path``:
      - ``multicell_template.pkl``: pickled multicell object used as a template;
      - ``s<agg_index>/simsetup/matrix_W.txt``: W matrix installed on the template;
      - ``aggregate/X_aggregate_last.npz``: aggregate array; row ``agg_index`` of
        its transpose is the state that gets plotted.

    The state is reshaped to ``(num_cells, simsetup['N'])`` and its transpose is
    passed to ``replot_modern`` and ``replot_scatter_dots`` together with output
    path prefixes ``<outdir>/agg<agg_index>_modern`` and
    ``<outdir>/agg<agg_index>_scatter``.

    Args:
        agg_index: index of the run inside the aggregate array; also selects the
            ``s<agg_index>`` run folder (see the TODO below on whether these match).
        fmod: forwarded unchanged to both replot functions.
        outdir: directory under which the output path prefixes are built.
        state_int: forwarded unchanged to both replot functions.

    Returns:
        None. Prints the loaded W matrix, the aggregate file path and the state shape.
    """
    # TODO do we need to load the varying simsetup_W in each dir of manyruns?
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted run folders.
    fpath_pickle = manyruns_path + os.sep + 'multicell_template.pkl'
    with open(fpath_pickle, 'rb') as pickle_file:
        multicell = pickle.load(pickle_file)  # unpickling multicell object

    # update W as it may vary across agg indices of manyruns
    # =======================================================================
    # TODO: does manyruns/s0 correspond to agg0 in manyruns/aggregate
    # =======================================================================
    agg_datadir = manyruns_path + os.sep + 's%d' % agg_index
    W_LOAD = np.loadtxt(agg_datadir + os.sep + 'simsetup' + os.sep + 'matrix_W.txt', delimiter=',')
    print(W_LOAD)
    multicell.matrix_W = W_LOAD
    multicell.simsetup['FIELD_SEND'] = W_LOAD

    # constants
    num_cells = multicell.num_cells
    simsetup = multicell.simsetup
    sidelength = int(np.sqrt(num_cells))
    assert sidelength ** 2 == num_cells  # the replot helpers are given a single side length

    # file suffix selecting which aggregate snapshot to read ('_last' only, for now)
    #if step is not None:
    #    smod = '_%d' % step
    smod = '_last'

    agg_dir = manyruns_path + os.sep + 'aggregate'
    fpath_state = agg_dir + os.sep + 'X_aggregate%s.npz' % smod
    print(fpath_state)
    # np.load on an .npz returns an NpzFile that holds the file open; the context
    # manager closes it so repeated calls do not leak file handles.
    with np.load(fpath_state) as npz_state:
        X_state = npz_state['arr_0'].T[agg_index, :]  # umap wants transpose
    print(X_state.shape)

    # plot option 2) using replot
    X_state = X_state.reshape(num_cells, simsetup['N'])

    #outpath_ref = outdir + os.sep + 'agg%d_ref0' % agg_index
    #replot_graph_lattice_reference_overlap_plotter(
    #    X_state.T, sidelength, outpath_ref, fmod=fmod, ref_node=0)

    outpath = outdir + os.sep + 'agg%d_modern' % agg_index
    replot_modern(X_state.T, simsetup, sidelength, outpath,
                  version='3', fmod=fmod, state_int=state_int)

    outpath = outdir + os.sep + 'agg%d_scatter' % agg_index
    replot_scatter_dots(X_state.T, simsetup, sidelength, outpath,
                        fmod=fmod, state_int=state_int)

In [118]:
embedding = data_subdict['algos']['umap']['embedding']
print(kmeans_labels)
print(embedding[0,:])

# Plot the first SAMPLES_PER_CLUSTER members of every cluster into <outdir>/c<k>.
SAMPLES_PER_CLUSTER = 2
for k in range(N_CLUSTERS):
    indices = clusterdata['cluster_to_idx'][k]

    cluster_outdir = outdir + os.sep + 'c%d' % k
    # makedirs also creates `outdir` itself when it is missing (os.mkdir would
    # fail), and exist_ok=False still refuses to reuse an existing cluster
    # folder -- now as a FileExistsError instead of an assert that `python -O`
    # would silently skip.
    os.makedirs(cluster_outdir, exist_ok=False)

    # slicing handles clusters with fewer than SAMPLES_PER_CLUSTER members
    for idx, agg_idx in enumerate(indices[:SAMPLES_PER_CLUSTER]):
        print(agg_idx)
        print('plotting cluster %d, example %d (agg %d)' % (k, idx, agg_idx))

        # now plot agg_idx as example of cluster k
        plot_tissue_given_agg_idx(agg_idx, '', cluster_outdir)
        
        

[ 1 24  0 ...  0 27  4]
[10.555836   3.1789017  6.3717284]
2
plotting cluster 0, example 0 (agg 2)
[[-0.128  -0.4663  0.6931 -0.7861  0.5873 -0.1449 -0.1874 -0.6615 -0.4805]
 [-0.4663  0.2423 -0.8407 -0.5594  0.16   -0.1265 -0.9456 -0.414  -0.2262]
 [ 0.6931 -0.8407  0.0105 -0.3003 -0.6754  0.5531 -0.5056  0.0481  0.664 ]
 [-0.7861 -0.5594 -0.3003 -0.0644  0.4015  0.0712 -0.8657 -0.2868  0.4735]
 [ 0.5873  0.16   -0.6754  0.4015  0.9291  0.9075  0.9877 -0.9086 -0.2416]
 [-0.1449 -0.1265  0.5531  0.0712  0.9075  0.0884  0.9412  0.9663 -0.974 ]
 [-0.1874 -0.9456 -0.5056 -0.8657  0.9877  0.9412  0.6005 -0.1173  0.5948]
 [-0.6615 -0.414   0.0481 -0.2868 -0.9086  0.9663 -0.1173  0.008  -0.4612]
 [-0.4805 -0.2262  0.664   0.4735 -0.2416 -0.974   0.5948 -0.4612  0.1654]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
17
plotting cluster 0, example 1 (agg 17)
[[-0.4107 -0.2

41
plotting cluster 5, example 0 (agg 41)
[[-0.4982 -0.1624  0.4906  0.1672  0.1644 -0.4347 -0.4348  0.6595 -0.2127]
 [-0.1624 -0.3355 -0.2036 -0.7999  0.7452  0.6526 -0.447  -0.9458  0.1565]
 [ 0.4906 -0.2036  0.2165  0.4827  0.5787  0.7487 -0.6874 -0.5181 -0.2438]
 [ 0.1672 -0.7999  0.4827 -0.8336 -0.5638 -0.7145  0.3276 -0.4673 -0.5926]
 [ 0.1644  0.7452  0.5787 -0.5638 -0.3068 -0.4399  0.8856 -0.2396 -0.2716]
 [-0.4347  0.6526  0.7487 -0.7145 -0.4399  0.0503  0.3626 -0.0415  0.406 ]
 [-0.4348 -0.447  -0.6874  0.3276  0.8856  0.3626  0.726  -0.3572  0.4745]
 [ 0.6595 -0.9458 -0.5181 -0.4673 -0.2396 -0.0415 -0.3572 -0.1259  0.3652]
 [-0.2127  0.1565 -0.2438 -0.5926 -0.2716  0.406   0.4745  0.3652 -0.7996]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
84
plotting cluster 5, example 1 (agg 84)
[[-0.9079 -0.738   0.8811 -0.5808 -0.6778 -0.7594 -0.0969  0.6647 -0.54

65
plotting cluster 10, example 0 (agg 65)
[[-0.5627 -0.5284 -0.7986 -0.1079 -0.8992 -0.8143 -0.9003 -0.2916 -0.678 ]
 [-0.5284 -0.5399  0.8552 -0.6406  0.1033  0.7412  0.4404 -0.4292 -0.8087]
 [-0.7986  0.8552 -0.5595  0.9701  0.6137 -0.3364  0.7695 -0.4427  0.2515]
 [-0.1079 -0.6406  0.9701  0.3265 -0.2467  0.8712 -0.9386  0.7895 -0.7451]
 [-0.8992  0.1033  0.6137 -0.2467  0.8626  0.2811  0.6235  0.347   0.8801]
 [-0.8143  0.7412 -0.3364  0.8712  0.2811 -0.875  -0.7842 -0.3642 -0.066 ]
 [-0.9003  0.4404  0.7695 -0.9386  0.6235 -0.7842  0.9053 -0.7136  0.8626]
 [-0.2916 -0.4292 -0.4427  0.7895  0.347  -0.3642 -0.7136 -0.9158  0.0669]
 [-0.678  -0.8087  0.2515 -0.7451  0.8801 -0.066   0.8626  0.0669  0.2033]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
91
plotting cluster 10, example 1 (agg 91)
[[-0.598   0.2548  0.1643  0.3035 -0.0685 -0.4242  0.2316  0.3483 -0.

38
plotting cluster 14, example 1 (agg 38)
[[-0.2305 -0.8508  0.0097 -0.1029 -0.1736 -0.8191 -0.2734  0.8397 -0.9457]
 [-0.8508 -0.4381  0.6464  0.5985  0.4584 -0.7577 -0.4999  0.1586  0.0526]
 [ 0.0097  0.6464  0.9207  0.4567 -0.9698  0.984  -0.1163  0.3403 -0.0887]
 [-0.1029  0.5985  0.4567 -0.8237 -0.6314 -0.352   0.2025  0.288   0.2836]
 [-0.1736  0.4584 -0.9698 -0.6314 -0.1139 -0.1965 -0.3411  0.325   0.0879]
 [-0.8191 -0.7577  0.984  -0.352  -0.1965 -0.0425  0.2966 -0.2175  0.9352]
 [-0.2734 -0.4999 -0.1163  0.2025 -0.3411  0.2966 -0.1655  0.4576  0.5758]
 [ 0.8397  0.1586  0.3403  0.288   0.325  -0.2175  0.4576  0.272   0.2315]
 [-0.9457  0.0526 -0.0887  0.2836  0.0879  0.9352  0.5758  0.2315 -0.1667]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
40
plotting cluster 15, example 0 (agg 40)
[[-0.1846  0.3725  0.6706  0.6701  0.0978  0.44   -0.611  -0.0862 -0.

14
plotting cluster 19, example 1 (agg 14)
[[ 0.0279 -0.5575  0.5048  0.314  -0.1842  0.9976  0.9914  0.0765  0.0162]
 [-0.5575  0.613   0.5263 -0.636   0.9828 -0.4242 -0.5067 -0.7593  0.1585]
 [ 0.5048  0.5263  0.741  -0.3559 -0.0957 -0.0764 -0.7799 -0.6939  0.2772]
 [ 0.314  -0.636  -0.3559  0.7751 -0.6202  0.4705  0.0046 -0.0996 -0.5719]
 [-0.1842  0.9828 -0.0957 -0.6202 -0.2967  0.8874 -0.0878 -0.1419 -0.1648]
 [ 0.9976 -0.4242 -0.0764  0.4705  0.8874 -0.6492  0.469   0.4673 -0.0664]
 [ 0.9914 -0.5067 -0.7799  0.0046 -0.0878  0.469   0.2419 -0.8529 -0.8339]
 [ 0.0765 -0.7593 -0.6939 -0.0996 -0.1419  0.4673 -0.8529 -0.4424  0.1405]
 [ 0.0162  0.1585  0.2772 -0.5719 -0.1648 -0.0664 -0.8339  0.1405 -0.543 ]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
5
plotting cluster 20, example 0 (agg 5)
[[-0.556  -0.6246  0.2576  0.9279 -0.9967 -0.061  -0.7771  0.8253 -0.26

1
plotting cluster 24, example 0 (agg 1)
[[-0.166   0.0776 -0.7192 -0.9219  0.373  -0.1042 -0.0169 -0.1716 -0.7214]
 [ 0.0776 -0.1616 -0.6038 -0.6603  0.6693  0.8172 -0.8933 -0.9001  0.6148]
 [-0.7192 -0.6038  0.6015  0.7563 -0.9634 -0.4128  0.1482  0.0718 -0.2046]
 [-0.9219 -0.6603  0.7563 -0.8033  0.5003 -0.4244 -0.7065  0.3276 -0.6693]
 [ 0.373   0.6693 -0.9634  0.5003  0.9777 -0.7399  0.1786  0.0298  0.855 ]
 [-0.1042  0.8172 -0.4128 -0.4244 -0.7399 -0.9613  0.3995  0.8892 -0.3045]
 [-0.0169 -0.8933  0.1482 -0.7065  0.1786  0.3995 -0.7953  0.1731  0.5016]
 [-0.1716 -0.9001  0.0718  0.3276  0.0298  0.8892  0.1731  0.8068  0.452 ]
 [-0.7214  0.6148 -0.2046 -0.6693  0.855  -0.3045  0.5016  0.452   0.7666]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
19
plotting cluster 24, example 1 (agg 19)
[[-0.8049  0.2713 -0.226   0.5003  0.0839 -0.6405  0.8726  0.9347 -0.73

75
plotting cluster 29, example 0 (agg 75)
[[ 0.1379  0.8672 -0.632   0.3903 -0.8296 -0.1849 -0.0637  0.4641 -0.0504]
 [ 0.8672  0.2963  0.4499  0.8969  0.7505 -0.1697  0.5589  0.918  -0.1479]
 [-0.632   0.4499  0.9979 -0.8443  0.9094 -0.6042 -0.433   0.0626  0.5796]
 [ 0.3903  0.8969 -0.8443 -0.5799  0.4568  0.9713 -0.7833  0.6074 -0.7659]
 [-0.8296  0.7505  0.9094  0.4568 -0.9495 -0.3734  0.7635 -0.0254 -0.7329]
 [-0.1849 -0.1697 -0.6042  0.9713 -0.3734 -0.6419  0.8814  0.524  -0.319 ]
 [-0.0637  0.5589 -0.433  -0.7833  0.7635  0.8814  0.2036  0.1904 -0.7422]
 [ 0.4641  0.918   0.0626  0.6074 -0.0254  0.524   0.1904 -0.0545  0.4055]
 [-0.0504 -0.1479  0.5796 -0.7659 -0.7329 -0.319  -0.7422  0.4055  0.1719]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wvary_dualInit_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
135
plotting cluster 29, example 1 (agg 135)
[[ 0.3227  0.8605 -0.9178  0.6709  0.8689 -0.9015  0.0931  0.9358  


