In [12]:
%matplotlib inline

# path hack for relative import in jupyter notebook
import os
import sys

# Put the parent of the notebook's working directory on the import path so the
# project packages imported in later cells (utils, multicell, singlecell) can
# be found -- presumably they live there; confirm against the repo layout.
CELLTYPES = os.path.dirname(os.path.abspath(''))
# Guard the append: re-running this cell must not stack duplicate entries.
if CELLTYPES not in sys.path:
    sys.path.append(CELLTYPES)

In [13]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pandas as pd
import umap
import time

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

from utils.file_io import RUNS_FOLDER

# Directory that receives every plot written by this notebook.
outdir = os.sep.join([RUNS_FOLDER, 'explore', 'umap'])

## Local utils

In [14]:
from multicell.unsupervised_helper import \
    plotly_express_embedding, generate_control_data, plot_given_multicell, make_dimreduce_object

from singlecell.singlecell_linalg import sorted_eig

## Single umap

In [15]:
# Defaults for the dimension reducers (seed and output dimensionality are
# shared by every reducer configured below).
REDUCER_SEED = 0
REDUCER_COMPONENTS = 3
VALID_REDUCERS = ['umap', 'tsne', 'pca']
REDUCERS_TO_USE = ['umap']
assert REDUCERS_TO_USE == ['umap']  # for now, extend later

# UMAP settings; see defaults: https://umap-learn.readthedocs.io/en/latest/api.html
UMAP_KWARGS = dict(
    random_state=REDUCER_SEED,
    n_components=REDUCER_COMPONENTS,
    metric='euclidean',
    init='spectral',
    unique=False,
    n_neighbors=15,
    min_dist=0.1,
    spread=1.0,
)

# t-SNE settings
TSNE_KWARGS = dict(
    random_state=REDUCER_SEED,
    n_components=REDUCER_COMPONENTS,
    metric='euclidean',
    init='random',
    perplexity=30.0,
)

# PCA settings
PCA_KWARGS = dict(n_components=REDUCER_COMPONENTS)

### 0) Dataset path

In [103]:
# 0) choose which manyruns dataset to analyse
gamma = 20.0
# alternative dataset: 'Wmaze15_gamma%.2f_10k_p3_M100' % gamma
manyruns_dirname = 'Wrandom0_gamma%.2f_10k_periodic_fixedorderV3_p3_M100' % gamma
manyruns_path = os.sep.join([RUNS_FOLDER, 'multicell_manyruns', manyruns_dirname])

# Step 0) seed the dataset record; the dim-reduction step fills in the rest
data_subdict = dict(label=manyruns_dirname, path=manyruns_path)

### 1) Dimension reduction (store in data_subdict object)

In [104]:
# Settings for this dimension-reduction run.
nsubsample = None
use_01 = True

# 1) fill out data_subdict (dim reduce); reducer outputs end up under 'algos'
data_subdict = make_dimreduce_object(
    data_subdict,
    reducers=REDUCERS_TO_USE,
    umap_kwargs=UMAP_KWARGS,
    tsne_kwargs=TSNE_KWARGS,
    pca_kwargs=PCA_KWARGS,
    use_01=use_01,
    nsubsample=nsubsample,
    flag_control=False,
    jitter_scale=0.0,
    step=None,
)
# save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma20.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(10000, 900)
Time to fit (umap): 25.36 sec


In [105]:
# Sanity-check the structure produced by the dim-reduction step.
print(data_subdict.keys())
for report in (lambda: data_subdict['algos']['umap'].keys(),
               lambda: data_subdict['algos']['umap']['embedding'].shape):
    # lookups are deferred so each one happens right before its own print
    print(report())

dict_keys(['label', 'path', 'data', 'index', 'energies', 'num_runs', 'total_spins', 'multicell_template', 'algos'])
dict_keys(['reducer', 'embedding'])
(10000, 3)


### 2) Visualize umap

In [106]:
# 2) visualize data_subdict: plot the embedding without per-index colouring
plotly_express_embedding(
    data_subdict,
    dirpath=outdir,
    fmod='jupyter',
    step=None,
    show=False,
    surf=False,
    as_landscape=False,
    color_by_index=False,
)

# Clustering and plotting of sample points

In [107]:
from sklearn.cluster import KMeans

"""
NOTES:
- kmeans.labels_ will return [cluster_7, cluster_5, ..., cluster_0] -- cluster id for each data point
- also have: kmeans.predict([[0, 0], [12, 3]])
- also have: kmeans.cluster_centers_
"""

#kmeans_highdim = KMeans(n_clusters=8, random_state=0).fit(X)
N_CLUSTERS = 30
# Placeholder palette: every cluster id maps to 'blue'.
CLUSTER_COLOURS = dict.fromkeys(range(N_CLUSTERS), 'blue')  # TODO implement, currently unused
assert N_CLUSTERS <= len(CLUSTER_COLOURS)

# Cluster the points in the low-dimensional UMAP embedding (not the raw data).
kmeans_lowdim = KMeans(n_clusters=N_CLUSTERS, random_state=0)
kmeans_lowdim.fit(data_subdict['algos']['umap']['embedding'])

def cluster_to_colour(a):
    """Return the colour registered in CLUSTER_COLOURS for cluster id `a`."""
    colour = CLUSTER_COLOURS[a]
    return colour

In [108]:
# Per-point cluster id assigned by the fitted k-means model.
kmeans_labels = kmeans_lowdim.labels_
color_vector = list(map(cluster_to_colour, kmeans_labels))  # TODO currently unused

# Bundle everything the plotting helper receives as its `clusterstyle`.
clusterdata = dict(
    color_vector=color_vector,
    cluster_ids=kmeans_labels,
    order=list(map(str, range(N_CLUSTERS))),
    # cluster id -> positions of the points that were assigned to it
    cluster_to_idx={a: np.flatnonzero(kmeans_labels == a) for a in range(N_CLUSTERS)},
)

## Embedding with colour by cluster

In [109]:
# Same embedding plot as before, now passing the cluster info via `clusterstyle`.
plotly_express_embedding(
    data_subdict,
    clusterstyle=clusterdata,
    dirpath=outdir,
    fmod='jupyter',
    step=None,
    show=False,
    surf=False,
    as_landscape=False,
    color_by_index=False,
)

## Plot individual points from the clustering

In [101]:
from multicell.multicell_replot import \
    replot_graph_lattice_reference_overlap_plotter, replot_modern, replot_scatter_dots

def plot_tissue_given_agg_idx(agg_index, fmod, outdir, state_int=False,
                              manyruns_dir=None, verbose=True):
    """Replot the final tissue state of one run from a manyruns directory.

    Loads the pickled multicell template, replaces its W matrix with the one
    stored for run `agg_index`, extracts that run's state from the aggregate
    state file and produces the 'modern' and 'scatter' replots.

    Args:
        agg_index: integer run index; used both as the row of the (transposed)
            aggregate state array and as the 's%d' run sub-directory name.
        fmod: filename modifier forwarded to the replot helpers.
        outdir: directory under which the output plot paths are built.
        state_int: forwarded unchanged to the replot helpers.
        manyruns_dir: manyruns directory to read from. Defaults to the
            notebook-level `manyruns_path`, which keeps old calls working but
            means the result depends on which dataset cell was run last.
        verbose: when True (the default, matching the previous behaviour)
            print the loaded W matrix, the aggregate file path and the state
            shape.

    Returns:
        None. The plots are produced by the replot helpers.
    """
    if manyruns_dir is None:
        manyruns_dir = manyruns_path

    # TODO do we need to load the varying simsetup_W in each dir of manyruns?
    # SECURITY NOTE: pickle.load can execute arbitrary code -- only point this
    # at run directories you created yourself.
    fpath_pickle = os.path.join(manyruns_dir, 'multicell_template.pkl')
    with open(fpath_pickle, 'rb') as pickle_file:
        multicell = pickle.load(pickle_file)  # unpickling multicell object

    # update W as it may vary across agg indices of manyruns
    # =======================================================================
    # TODO: does manyruns/s0 correspond to agg0 in manyruns/aggregate
    # =======================================================================
    fpath_W = os.path.join(manyruns_dir, 's%d' % agg_index, 'simsetup', 'matrix_W.txt')
    W_LOAD = np.loadtxt(fpath_W, delimiter=',')
    if verbose:
        print(W_LOAD)
    multicell.matrix_W = W_LOAD
    multicell.simsetup['FIELD_SEND'] = W_LOAD

    # constants
    num_cells = multicell.num_cells
    simsetup = multicell.simsetup
    sidelength = int(np.sqrt(num_cells))
    assert sidelength ** 2 == num_cells  # the cells must form a square lattice

    # only the final-step aggregate is read here (a per-step suffix would be '_%d' % step)
    smod = '_last'
    fpath_state = os.path.join(manyruns_dir, 'aggregate', 'X_aggregate%s.npz' % smod)
    if verbose:
        print(fpath_state)

    # np.load on an .npz keeps the archive open until closed; the context
    # manager releases the file handle once the array has been read.
    with np.load(fpath_state) as npz_file:
        X = npz_file['arr_0'].T  # umap wants transpose; rows are then indexed by run
    X_state = X[agg_index, :]
    if verbose:
        print(X_state.shape)

    # plot option 2) using replot: one row per cell, simsetup['N'] entries each
    X_state = X_state.reshape(num_cells, simsetup['N'])

    # optional reference-overlap plot (disabled):
    # outpath_ref = outdir + os.sep + 'agg%d_ref0' % agg_index
    # replot_graph_lattice_reference_overlap_plotter(
    #     X_state.T, sidelength, outpath_ref, fmod=fmod, ref_node=0)

    outpath = os.path.join(outdir, 'agg%d_modern' % agg_index)
    replot_modern(X_state.T, simsetup, sidelength, outpath,
                  version='3', fmod=fmod, state_int=state_int)

    outpath = os.path.join(outdir, 'agg%d_scatter' % agg_index)
    replot_scatter_dots(X_state.T, simsetup, sidelength, outpath,
                        fmod=fmod, state_int=state_int)

In [102]:
embedding = data_subdict['algos']['umap']['embedding']
print(kmeans_labels)
print(embedding[0,:])

# Number of example points to replot for every cluster.
SAMPLES_PER_CLUSTER = 2
for k in range(N_CLUSTERS):
    indices = clusterdata['cluster_to_idx'][k]

    # One output folder per cluster. makedirs(exist_ok=True) also creates a
    # missing `outdir` and lets this cell be re-run; the previous
    # assert + os.mkdir aborted on the second run. Plots from an earlier run
    # in the same folder may be overwritten.
    cluster_outdir = os.path.join(outdir, 'c%d' % k)
    os.makedirs(cluster_outdir, exist_ok=True)

    # Slicing copes with clusters holding fewer than SAMPLES_PER_CLUSTER points.
    for idx, agg_idx in enumerate(indices[:SAMPLES_PER_CLUSTER]):
        print('plotting cluster %d, example %d (agg %d)' % (k, idx, agg_idx))

        # now plot agg_idx as an example of cluster k
        plot_tissue_given_agg_idx(agg_idx, '', cluster_outdir)
        
        

[ 5  7 19 ... 16 19  5]
[ 2.8478982  -0.27260047  2.7478404 ]
13
plotting cluster 0, example 0 (agg 13)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0.4684  0.0982 -0.7404 -0.2806  0.7634  0.7394  0.9951 -0.8231  0.6741]
 [-0.9308 -0.7404 -0.8937 -0.4993  0.614  -0.7999 -0.638   0.2284  0.7981]
 [ 0.8925 -0.2806 -0.4993  0.6032  0.88   -0.8589  0.2933 -0.3111 -0.1149]
 [ 0.0115  0.7634  0.614   0.88   -0.5619  0.2813 -0.9194 -0.0667 -0.5233]
 [ 0.9     0.7394 -0.7999 -0.8589  0.2813 -0.1074  0.6673 -0.6074 -0.199 ]
 [ 0.9612  0.9951 -0.638   0.2933 -0.9194  0.6673 -0.0893 -0.7373  0.8924]
 [ 0.8623 -0.8231  0.2284 -0.3111 -0.0667 -0.6074 -0.7373 -0.7725 -0.851 ]
 [-0.2361  0.6741  0.7981 -0.1149 -0.5233 -0.199   0.8924 -0.851  -0.2568]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
19
plotting cluster 0, example 1 (agg 19)
[[ 0.5582  0.46

0
plotting cluster 5, example 0 (agg 0)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0.4684  0.0982 -0.7404 -0.2806  0.7634  0.7394  0.9951 -0.8231  0.6741]
 [-0.9308 -0.7404 -0.8937 -0.4993  0.614  -0.7999 -0.638   0.2284  0.7981]
 [ 0.8925 -0.2806 -0.4993  0.6032  0.88   -0.8589  0.2933 -0.3111 -0.1149]
 [ 0.0115  0.7634  0.614   0.88   -0.5619  0.2813 -0.9194 -0.0667 -0.5233]
 [ 0.9     0.7394 -0.7999 -0.8589  0.2813 -0.1074  0.6673 -0.6074 -0.199 ]
 [ 0.9612  0.9951 -0.638   0.2933 -0.9194  0.6673 -0.0893 -0.7373  0.8924]
 [ 0.8623 -0.8231  0.2284 -0.3111 -0.0667 -0.6074 -0.7373 -0.7725 -0.851 ]
 [-0.2361  0.6741  0.7981 -0.1149 -0.5233 -0.199   0.8924 -0.851  -0.2568]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
10
plotting cluster 5, example 1 (agg 10)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0

3
plotting cluster 10, example 0 (agg 3)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0.4684  0.0982 -0.7404 -0.2806  0.7634  0.7394  0.9951 -0.8231  0.6741]
 [-0.9308 -0.7404 -0.8937 -0.4993  0.614  -0.7999 -0.638   0.2284  0.7981]
 [ 0.8925 -0.2806 -0.4993  0.6032  0.88   -0.8589  0.2933 -0.3111 -0.1149]
 [ 0.0115  0.7634  0.614   0.88   -0.5619  0.2813 -0.9194 -0.0667 -0.5233]
 [ 0.9     0.7394 -0.7999 -0.8589  0.2813 -0.1074  0.6673 -0.6074 -0.199 ]
 [ 0.9612  0.9951 -0.638   0.2933 -0.9194  0.6673 -0.0893 -0.7373  0.8924]
 [ 0.8623 -0.8231  0.2284 -0.3111 -0.0667 -0.6074 -0.7373 -0.7725 -0.851 ]
 [-0.2361  0.6741  0.7981 -0.1149 -0.5233 -0.199   0.8924 -0.851  -0.2568]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
5
plotting cluster 10, example 1 (agg 5)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0

12
plotting cluster 15, example 0 (agg 12)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0.4684  0.0982 -0.7404 -0.2806  0.7634  0.7394  0.9951 -0.8231  0.6741]
 [-0.9308 -0.7404 -0.8937 -0.4993  0.614  -0.7999 -0.638   0.2284  0.7981]
 [ 0.8925 -0.2806 -0.4993  0.6032  0.88   -0.8589  0.2933 -0.3111 -0.1149]
 [ 0.0115  0.7634  0.614   0.88   -0.5619  0.2813 -0.9194 -0.0667 -0.5233]
 [ 0.9     0.7394 -0.7999 -0.8589  0.2813 -0.1074  0.6673 -0.6074 -0.199 ]
 [ 0.9612  0.9951 -0.638   0.2933 -0.9194  0.6673 -0.0893 -0.7373  0.8924]
 [ 0.8623 -0.8231  0.2284 -0.3111 -0.0667 -0.6074 -0.7373 -0.7725 -0.851 ]
 [-0.2361  0.6741  0.7981 -0.1149 -0.5233 -0.199   0.8924 -0.851  -0.2568]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
65
plotting cluster 15, example 1 (agg 65)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]


17
plotting cluster 20, example 0 (agg 17)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0.4684  0.0982 -0.7404 -0.2806  0.7634  0.7394  0.9951 -0.8231  0.6741]
 [-0.9308 -0.7404 -0.8937 -0.4993  0.614  -0.7999 -0.638   0.2284  0.7981]
 [ 0.8925 -0.2806 -0.4993  0.6032  0.88   -0.8589  0.2933 -0.3111 -0.1149]
 [ 0.0115  0.7634  0.614   0.88   -0.5619  0.2813 -0.9194 -0.0667 -0.5233]
 [ 0.9     0.7394 -0.7999 -0.8589  0.2813 -0.1074  0.6673 -0.6074 -0.199 ]
 [ 0.9612  0.9951 -0.638   0.2933 -0.9194  0.6673 -0.0893 -0.7373  0.8924]
 [ 0.8623 -0.8231  0.2284 -0.3111 -0.0667 -0.6074 -0.7373 -0.7725 -0.851 ]
 [-0.2361  0.6741  0.7981 -0.1149 -0.5233 -0.199   0.8924 -0.851  -0.2568]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
41
plotting cluster 20, example 1 (agg 41)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]


35
plotting cluster 25, example 0 (agg 35)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]
 [ 0.4684  0.0982 -0.7404 -0.2806  0.7634  0.7394  0.9951 -0.8231  0.6741]
 [-0.9308 -0.7404 -0.8937 -0.4993  0.614  -0.7999 -0.638   0.2284  0.7981]
 [ 0.8925 -0.2806 -0.4993  0.6032  0.88   -0.8589  0.2933 -0.3111 -0.1149]
 [ 0.0115  0.7634  0.614   0.88   -0.5619  0.2813 -0.9194 -0.0667 -0.5233]
 [ 0.9     0.7394 -0.7999 -0.8589  0.2813 -0.1074  0.6673 -0.6074 -0.199 ]
 [ 0.9612  0.9951 -0.638   0.2933 -0.9194  0.6673 -0.0893 -0.7373  0.8924]
 [ 0.8623 -0.8231  0.2284 -0.3111 -0.0667 -0.6074 -0.7373 -0.7725 -0.851 ]
 [-0.2361  0.6741  0.7981 -0.1149 -0.5233 -0.199   0.8924 -0.851  -0.2568]]
I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100\aggregate\X_aggregate_last.npz
(900,)
46
plotting cluster 25, example 1 (agg 46)
[[ 0.5582  0.4684 -0.9308  0.8925  0.0115  0.9     0.9612  0.8623 -0.2361]



