In [12]:
%matplotlib inline

# path hack for relative import in jupyter notebook
import os
import sys

# LIBRARY GLOBAL MODS
CELLTYPES = os.path.dirname(os.path.abspath(''))
sys.path.append(CELLTYPES)

In [13]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pandas as pd
import umap
import time
%matplotlib inline

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

from utils.file_io import RUNS_FOLDER

## Local utils

In [14]:
from multicell.unsupervised_helper import \
    plotly_express_embedding, generate_control_data, plot_given_multicell, make_dimreduce_object

from singlecell.singlecell_linalg import sorted_eig

## Single umap

In [15]:
# Baseline reducer settings; cells further down copy these and override individual entries
REDUCER_SEED = 0
REDUCER_COMPONENTS = 3
VALID_REDUCERS = ['umap', 'tsne', 'pca']
REDUCERS_TO_USE = ['umap']
assert REDUCERS_TO_USE == ['umap']  # only umap is handled for now, extend later

# see defaults: https://umap-learn.readthedocs.io/en/latest/api.html
UMAP_KWARGS = dict(
    random_state=REDUCER_SEED,
    n_components=REDUCER_COMPONENTS,
    metric='euclidean',
    init='spectral',
    unique=False,
    n_neighbors=15,
    min_dist=0.1,
    spread=1.0,
)
TSNE_KWARGS = dict(
    random_state=REDUCER_SEED,
    n_components=REDUCER_COMPONENTS,
    metric='euclidean',
    init='random',
    perplexity=30.0,
)
PCA_KWARGS = dict(n_components=REDUCER_COMPONENTS)

### 0) Dataset path

In [16]:
# 0) pick the 'manyruns' directory for the chosen gamma
gamma = 1.0
manyruns_dirname = 'Wmaze15_gamma%.2f_10k_p3_M100' % gamma
manyruns_path = os.sep.join([RUNS_FOLDER, 'multicell_manyruns', manyruns_dirname])

# Step 0) start the dataset dict with its label and location on disk
data_subdict = dict(label=manyruns_dirname, path=manyruns_path)

### 1) Dimension reduction (store in data_subdict object)

In [None]:
use_01 = True
nsubsample = None

# 1) fill out data_subdict (dim reduce)
data_subdict = make_dimreduce_object(
    data_subdict, 
    nsubsample=nsubsample, 
    flag_control=False,
    use_01=use_01, 
    jitter_scale=0.0,
    reducers=REDUCERS_TO_USE,
    umap_kwargs=UMAP_KWARGS, tsne_kwargs=TSNE_KWARGS, pca_kwargs=PCA_KWARGS,
    step=None)
#save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

I:\Development\Repositories\biomodels\celltypes\runs\multicell_manyruns\Wmaze15_gamma1.00_10k_p3_M100\aggregate\X_aggregate_last.npz
(10000, 900)


In [None]:
print(data_subdict.keys())
print(data_subdict['algos']['umap'].keys())
print(data_subdict['algos']['umap']['embedding'].shape)

### 2) Visualize umap

In [None]:
outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'umap'

# 2) visualize data_subdict
plotly_express_embedding(data_subdict, 
                         color_by_index=False, 
                         as_landscape=False, 
                         fmod='jupyter', 
                         show=False, 
                         dirpath=outdir, 
                         surf=False, 
                         step=None)

# Clustering and plotting of sample points

In [None]:
from sklearn.cluster import KMeans

"""
NOTES:
- kmeans.labels_ will return [cluster_7, cluster_5, ..., cluster_0] -- cluster id for each data point
- also have: kmeans.predict([[0, 0], [12, 3]])
- also have: kmeans.cluster_centers_
"""

#kmeans_highdim = KMeans(n_clusters=8, random_state=0).fit(X)
N_CLUSTERS = 8
CLUSTER_COLOURS = {0: 'blue', 1: 'red', 2:'green', 3:'purple', 4: 'pink', 5: 'brown', 6: 'gray', 7: 'black'}
assert N_CLUSTERS <= len(CLUSTER_COLOURS.keys())

kmeans_lowdim = KMeans(n_clusters=N_CLUSTERS, random_state=0).fit(data_subdict['algos']['umap']['embedding'])

In [None]:
def cluster_to_colour(a):
    """Look up the colour name registered for cluster id `a` in CLUSTER_COLOURS."""
    colour = CLUSTER_COLOURS[a]
    return colour


kmeans_labels = kmeans_lowdim.labels_ 
color_vector = [cluster_to_colour(a) for a in kmeans_labels]


#clusterstyle = {'color_vector': color_vector, 
#                'cluster_ids': kmeans_labels.astype('str')}
clusterstyle = {'color_vector': color_vector, 
                'cluster_ids': kmeans_labels,
                'order': [str(a) for a in range(N_CLUSTERS)]
               }

In [None]:
plotly_express_embedding(data_subdict, 
                         color_by_index=False, 
                         as_landscape=False, 
                         fmod='jupyter', 
                         show=False, 
                         dirpath=outdir, 
                         surf=False, 
                         step=None,
                         clusterstyle=clusterstyle)

# OLD BELOW

In [26]:
print(kmeans_lowdim.labels_)

[2 1 4 ... 4 0 7]


In [27]:
fcontents['colours_highdim'] = [label_to_colour(a) for a in kmeans_highdim.labels_]
fcontents['colours_lowdim'] = [label_to_colour(a) for a in kmeans_lowdim.labels_]

NameError: name 'kmeans_highdim' is not defined

In [None]:
def umap_plotly_clustered(data_subdict):
    """
    Scatter-plot the UMAP embedding stored in `data_subdict` with plotly express,
    coloured by the per-point values in data_subdict['colours_lowdim'].

    Keys read from `data_subdict`:
        'num_runs'                     - number of points (fills the 'index' column)
        'label'                        - dataset name, used in the title and the file name
        'algos' -> 'umap' -> 'embedding' - array with 2 or 3 columns
        'colours_lowdim'               - one colour value per point

    Writes "umap_plotly_<label>_clustered.html" (relative to the current working
    directory), then shows the figure. Returns None.
    """
    import plotly.express as px

    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    embedding = data_subdict['algos']['umap']['embedding']
    #c = data_subdict['energies'][:, 0]  # range(num_runs)
    c = data_subdict['colours_lowdim']

    umap_dim = embedding.shape[1]
    assert umap_dim in [2, 3]

    # columns common to the 2D and 3D plots; 'z' is only added for 3D embeddings
    columns = {'index': range(num_runs),
               'c': c,
               'x': embedding[:, 0],
               'y': embedding[:, 1]}
    title = 'UMAP of %s dataset' % label

    if umap_dim == 2:
        fig = px.scatter(pd.DataFrame(columns), x='x', y='y',
                         color='c',
                         title=title)
    else:
        columns['z'] = embedding[:, 2]
        fig = px.scatter_3d(pd.DataFrame(columns), x='x', y='y', z='z',
                            color='c',
                            title=title)

    fig.write_html("umap_plotly_%s_clustered.html" % label)
    fig.show()
    
def plotly_express_embedding_clustered(data_subdict):
    """
    Plot every embedding in data_subdict['algos'] with plotly express, coloured by
    data_subdict['colours_highdim'], and save each plot as html.

    Supports 2D and 3D embeddings

    Keys read from `data_subdict`:
        'num_runs'        - number of points (fills the 'index' column)
        'label'           - dataset name, used in titles and file names
        'path'            - dataset folder; plots are written to <path>/dimreduce
        'colours_highdim' - one colour value per point
        'algos'           - dict: algorithm name -> dict holding an 'embedding' array

    For each algorithm writes "<algo>_plotly_<label>_clustered.html" and shows the figure.
    Returns None.
    """
    import plotly.express as px

    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    dirpath = data_subdict['path'] + os.sep + 'dimreduce'
    #c = data_subdict['energies'][:, 0]  # range(num_runs)
    # NOTE(review): the values are 'colours_highdim' but the column (and so the legend)
    # is named 'energy' -- confirm whether the label should change
    c = data_subdict['colours_highdim']

    for algo, algodict in data_subdict['algos'].items():
        embedding = algodict['embedding']

        n_components = embedding.shape[1]
        assert n_components in [2, 3]

        # columns common to the 2D and 3D plots; 'z' is only added for 3D embeddings
        columns = {'index': range(num_runs),
                   'energy': c,
                   'x': embedding[:, 0],
                   'y': embedding[:, 1]}
        title = '%s of %s dataset' % (algo, label)

        if n_components == 2:
            fig = px.scatter(pd.DataFrame(columns), x='x', y='y',
                             color='energy',
                             title=title)
        else:
            columns['z'] = embedding[:, 2]
            fig = px.scatter_3d(pd.DataFrame(columns), x='x', y='y', z='z',
                                color='energy',
                                title=title)

        # exist_ok avoids the race between checking for the folder and creating it
        os.makedirs(dirpath, exist_ok=True)
        fig.write_html(dirpath + os.sep + "%s_plotly_%s_clustered.html" % (algo, label))
        fig.show()
    return

plotly_express_embedding_clustered(fcontents)

# Visualize arbitrary point in umap

In [None]:
def plot_given_multicell(multicell, step_hack, agg_index, outdir):
    """
    Refresh the global data of `multicell` at step `step_hack` and draw its state
    visualisation to three png files inside `outdir`.

    The file names are prefixed with 'agg<agg_index>_' so plots of different
    aggregate states do not overwrite each other. Returns None.
    """
    name_templates = ('agg%d_compOverlap.png',
                      'agg%d_compProj.png',
                      'agg%d_ref0_overlap.png')
    fpaths = []
    for template in name_templates:
        fpaths.append(outdir + os.sep + template % agg_index)

    multicell.step_datadict_update_global(step_hack, fill_to_end=False)
    multicell.step_state_visualize(step_hack, fpaths=fpaths)  # visualize

In [None]:
# A: [325, 269, 3918, 5329]
# B: [2145, 8616, 6241, 1632]

# CHOICES
agg_indices = [2145, 8616, 6241, 1632]
dataset_label = '0e_10k'
outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'B'

# NON-LOOP
# makedirs(exist_ok=True) also creates a missing parent ('explore') and tolerates reruns
os.makedirs(outdir, exist_ok=True)
# NOTE(review): `gamma_vals` and `datasets` are not defined by any cell above this one;
# they must already exist in the kernel -- confirm where they come from
dataset_index = gamma_vals.index(dataset_label)
subdict = datasets[dataset_index]
multicell = subdict['multicell_template']

# Every state below is written into step 0 of the template (see the TODO on step_hack),
# so check once, before touching it, that the template's applied field is identically zero.
# NOTE(review): assumes subdict['total_spins'] holds the number of spins -- confirm
assert np.array_equal(multicell.field_applied,
                      np.zeros((subdict['total_spins'], multicell.total_steps)))

step_hack = 0  # TODO care this will break if class has time-varying applied field
for agg_index in agg_indices:
    # pull relevant info from subdict
    X = subdict['data'][agg_index, :]
    multicell.graph_state_arr[:, step_hack] = X[:]
    plot_given_multicell(multicell, step_hack, agg_index, outdir)

# 3D vis attempts

In [None]:
# see https://www.kaggle.com/scratchpad/notebook163accf2b7/edit

import umap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import seaborn as sns
import plotly
import plotly.express as px


%matplotlib notebook

In [None]:
def umap_3d_mpl(dataset):
    """
    Draw the first three columns of dataset['embedding'] as a matplotlib 3D scatter
    plot titled "umap_<dataset['label']>" and show it.

    All points are drawn with a single scatter call (one artist) rather than one
    call per point. For large, interactive plots the plotly functions are still
    the better choice.
    """
    embedding = dataset['embedding']

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    marker = 'o' # '^'
    # vectorised: pass whole coordinate columns instead of looping over the rows
    ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], marker=marker)

    ax.set_xlabel('u1')
    ax.set_ylabel('u2')
    ax.set_zlabel('u3')
    ax.set_title("umap_%s" % dataset['label'])

    plt.show()

    
def umap_plotly_express(data_subdict):
    """
    Scatter-plot data_subdict['embedding'] with plotly express, coloured by energy.

    Supports 2D and 3D embeddings

    Keys read from `data_subdict`:
        'num_runs'  - number of points (fills the 'index' column)
        'label'     - dataset name, used in the title and the file name
        'embedding' - array with 2 or 3 columns
        'energies'  - 2D array; column 0 is used as the colour value

    Writes "umap_plotly_<label>.html" (relative to the current working directory),
    then shows the figure. Returns None.
    """
    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    embedding = data_subdict['embedding']
    c = data_subdict['energies'][:, 0]  # range(num_runs)

    umap_dim = embedding.shape[1]
    assert umap_dim in [2, 3]

    # columns common to the 2D and 3D plots; 'z' is only added for 3D embeddings
    columns = {'index': range(num_runs),
               'energy': c,
               'x': embedding[:, 0],
               'y': embedding[:, 1]}
    title = 'UMAP of %s dataset' % label

    if umap_dim == 2:
        fig = px.scatter(pd.DataFrame(columns), x='x', y='y',
                         color='energy',
                         title=title)
    else:
        columns['z'] = embedding[:, 2]
        fig = px.scatter_3d(pd.DataFrame(columns), x='x', y='y', z='z',
                            color='energy',
                            title=title)

    fig.write_html("umap_plotly_%s.html" % label)
    fig.show()
    

def umap_plotly_general(dataset):
    """
    Placeholder for a plotly graph_objects version of umap_plotly_express().

    TODO: implement this more general version of the 'express' code -- see docs.
    The intent is to support 2D and 3D embeddings; for now `dataset` is ignored
    and a demo helix is shown as a 3D marker plot.
    """
    import plotly.graph_objects as go

    # Helix equation: (cos t, sin t, t) sampled at 50 points on [0, 10]
    t_vals = np.linspace(0, 10, 50)
    helix_trace = go.Scatter3d(x=np.cos(t_vals), y=np.sin(t_vals), z=t_vals,
                               mode='markers')

    helix_fig = go.Figure(data=[helix_trace])
    helix_fig.show()

#umap_3d_plotly_generic(None)

In [None]:
for i in range(len(gamma_vals)):
    umap_plotly_express(datasets[i])
umap_plotly_express(datasets[-1])

**K means on umap to get labels for points in each cluster**

**Compare lowdim kmeans labels with statistics in original space**

In [None]:
print(kmeans_lowdim.labels_)

blue_indices = [idx for idx, i in enumerate(kmeans_lowdim.labels_) if i == 0]
red_indices = [idx for idx, i in enumerate(kmeans_lowdim.labels_) if i == 1]
fcontents['data_blue'] = X_pm1[blue_indices, :] 
fcontents['data_red'] = X_pm1[red_indices, :]

print(fcontents['data_blue'].shape)
print(fcontents['data_red'].shape)

**Testing UNIQUE flag for umap**

In [None]:
# Explicit imports instead of a wildcard, so it is clear where each name used by
# this cell comes from (joblib loads cached dimreduce files further down)
import joblib

from multicell.unsupervised_helper import (
    generate_control_data, make_dimreduce_object, save_dimreduce_object)

build_dimreduce_dicts = True
add_control_data = False
vis_all = True
pca_assess = False
nsubsample = None  # None or an int

# Step 0) which 'manyruns' dirs to work with
#gamma_list = [0.0, 0.05, 0.1, 0.2, 1.0, 2.0, 20.0]
gamma_list = [20.0]
#gamma_list = [20.0]
#manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_ferro' % a for a in gamma_list]
#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_p3_M100' % a for a in gamma_list]
manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]

manyruns_paths = [RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
                  for dirname in manyruns_dirnames]

# Step 1) umap (or other dim reduction) kwargs
if any([build_dimreduce_dicts, add_control_data, vis_all, pca_assess]):
    for n_components in [2, 3]:
        #n_components = 3
        pca_kwargs = PCA_KWARGS.copy()
        pca_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        umap_kwargs = UMAP_KWARGS.copy()
        umap_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        tsne_kwargs = TSNE_KWARGS.copy()
        tsne_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        # modify pca settings
        # modify umap settings
        umap_kwargs['unique'] = True
        #umap_kwargs['n_neighbors'] = 100
        #umap_kwargs['min_dist'] = 0.1
        #umap_kwargs['spread'] = 1.0
        #umap_kwargs['metric'] = 'euclidean'
        # modify tsne settings
        tsne_kwargs['perplexity'] = 100

        # Modify filename suffix for dimreduce pkl and plots
        fmod = '_F=' + '+'.join(REDUCERS_TO_USE)
        fmod += '_dim%d_seed%d' % (umap_kwargs['n_components'], umap_kwargs['random_state'])
        if nsubsample is not None:
            fmod += '_nn%d' % nsubsample
        if 'umap' in REDUCERS_TO_USE:
            if umap_kwargs['metric'] != 'euclidean':
                fmod += '_%s' % umap_kwargs['metric']
            if umap_kwargs['init'] != 'spectral':
                fmod += '_%s' % umap_kwargs['init']
            if umap_kwargs['n_neighbors'] != 15:
                fmod += '_nbor%d' % umap_kwargs['n_neighbors']
            if umap_kwargs['min_dist'] != 0.1:
                fmod += '_dist%.2f' % umap_kwargs['min_dist']
            if umap_kwargs['spread'] != 1.0:
                fmod += '_spread%.2f' % umap_kwargs['spread']
            if umap_kwargs['unique']:
                fmod += '_unique'
        if 'tsne' in REDUCERS_TO_USE:
            if tsne_kwargs['perplexity'] != 30.0:
                fmod += '_perplex%.2f' % tsne_kwargs['perplexity']

        # Step 2) make/load data
        datasets = {i: {'label': manyruns_dirnames[i],
                        'path': manyruns_paths[i]}
                    for i in range(len(manyruns_dirnames))}

        for idx in range(len(manyruns_dirnames)):
            fpath = manyruns_paths[idx] + os.sep + 'dimreduce' + os.sep + 'dimreduce%s.z' % fmod
            if os.path.isfile(fpath):
                print('Exists already, loading: %s' % fpath)
                fcontents = joblib.load(fpath)  # just load file if it exists
                datasets[idx] = fcontents
            else:
                print('Dim. reduction on manyruns: %s' % manyruns_dirnames[idx])
                datasets[idx] = make_dimreduce_object(
                    datasets[idx], nsubsample=nsubsample, flag_control=False,
                    umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs)
                save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

        if add_control_data:
            print('adding control data...')
            total_spins_0 = datasets[0]['total_spins']
            num_runs_0 = datasets[0]['num_runs']

            # add control data into the dict of datasets
            control_X = generate_control_data(total_spins_0, num_runs_0)
            control_folder = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + 'control'
            control_fpath = control_folder + os.sep + \
                            'dimreduce' + os.sep + 'dimreduce%s.z' % fmod

            datasets[-1] = {
                'data': control_X,
                'label': 'control (coin-flips)',
                'num_runs': num_runs_0,
                'total_spins': total_spins_0,
                'energies': np.zeros((num_runs_0, 5)),
                'path': control_folder
            }
            datasets[-1] = make_dimreduce_object(
                datasets[-1], flag_control=True, nsubsample=nsubsample,
                umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs)
            save_dimreduce_object(datasets[-1], control_fpath)  # save to file (joblib)


**Testing plotly surface plot**

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Read data from a csv
z_data = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/api_docs/mt_bruno_elevation.csv')

#print(z_data['x'])
data_top = z_data.head()
print(data_top)
print(z_data.shape)
print(z_data.values.shape)
print(type(z_data.values))
a = z_data.values
# -1 lets numpy infer the row length; reshape(1, len(a)) only works for single-column
# data and raises ValueError for any array with more than one column
a.reshape(1, -1)

In [None]:
fig = go.Figure(data=[go.Surface(z=z_data.values)])

fig.update_layout(title='Mt Bruno Elevation', autosize=False,
                  width=500, height=500,
                  margin=dict(l=65, r=50, b=65, t=90))

fig.show()

**Single 2D+E landscape**

In [None]:
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as px

# NOTE(review): `pick` indexes manyruns_paths / gamma_list, which hold a single entry in
# the cells above that define them -- confirm the intended lists before running
pick = 1

use_plotly = True

manyruns_path = manyruns_paths[pick]
print(pick, gamma_list[pick], manyruns_path)
fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
# NOTE(review): `nn` and `aligned_mapper` are not defined by any cell above this one;
# they must already exist in the kernel -- confirm where they come from
energies = X_energies[:nn, 0]

X = aligned_mapper.embeddings_[pick]
print(X.shape)

print(X[2602,:])

# ===============================
if use_plotly:
    clabel = 'energies'
    df = pd.DataFrame({'index': range(nn),
                        clabel: energies,
                       'x': X[:, 0],
                       'y': X[:, 1],
                       'z': energies})

    fig = px.scatter_3d(df, x='x', y='y', z='z',
                        color=clabel,
                        title='jupyter',
                        hover_name='index')
    fig.show()
else:
    # create the matplotlib figure only on this branch, so the plotly branch
    # does not leave an empty matplotlib figure behind
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X[:,0], X[:,1], energies, c=energies, cmap='Spectral_r')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    plt.show()


# Plot specific points from index

In [None]:
from multicell.unsupervised_helper import plot_given_multicell


agg_indices = [911]
outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'plot_specific_points'
os.makedirs(outdir, exist_ok=True)  # make sure the plot folder exists before plotting into it

# where is the data?
step = None
#dirname = 'Wrandom0_gamma0.20_10k_periodic_fixedorderV3_p3_M100'
dirname = 'Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100'


#step = 14
#dirname = 'beta2.05_Wrandom0_gamma0.20_10k_periodic_fixedorderV3_p3_M100'

manyruns_path = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
fpath_pickle = manyruns_path + os.sep + 'multicell_template.pkl'
# NOTE: pickle.load can execute arbitrary code -- only unpickle templates written by trusted runs
with open(fpath_pickle, 'rb') as pickle_file:
    multicell = pickle.load(pickle_file)  # unpickling multicell object

# The state file depends only on `step`, not on agg_index: locate and load it once
#smod = ''
smod = '_last'
if step is not None:
    smod = '_%d' % step

agg_dir = manyruns_path + os.sep + 'aggregate'
fpath_state = agg_dir + os.sep + 'X_aggregate%s.npz' % smod
print(fpath_state)
X = np.load(fpath_state)['arr_0'].T  # umap wants transpose

step_hack = 0  # TODO care this will break if class has time-varying applied field
for agg_index in agg_indices:
    # write the chosen aggregate state into the template, then plot it
    X_state = X[agg_index, :]
    multicell.graph_state_arr[:, step_hack] = X_state[:]
    #assert np.array_equal(multicell_template.field_applied, np.zeros((total_spins, multicell_template.total_steps)))
    plot_given_multicell(multicell, step_hack, agg_index, outdir)


**Check individual lattice states**

In [None]:
from utils.file_io import run_subdir_setup, RUNS_FOLDER, INPUT_FOLDER
from multicell.graph_helper import state_load
from multicell.graph_adjacency import lattice_square_loc_to_int
from singlecell.singlecell_simsetup import singlecell_simsetup

replot_dir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'replot'

sidelength = 20
curated = True
random_mem = False        # TODO incorporate seed in random XI in simsetup/curated
random_W = False          # TODO incorporate seed in random W in simsetup/curated
W_override_path = INPUT_FOLDER + os.sep + 'manual_WJ' + os.sep + 'simsetup_W_9_maze.txt'
simsetup_main = singlecell_simsetup(
    unfolding=True, random_mem=random_mem, random_W=random_W, curated=curated, housekeeping=0)
if W_override_path is not None:
    print('Note: in main, overriding W from file...')
    explicit_W = np.loadtxt(W_override_path, delimiter=',')
    simsetup_main['FIELD_SEND'] = explicit_W
print("simsetup checks:")
print("\tsimsetup['N'],", simsetup_main['N'])
print("\tsimsetup['P'],", simsetup_main['P'])

In [None]:
# sorted: os.listdir returns entries in arbitrary order, and a later cell picks a file
# from fpaths by position, so the order has to be reproducible
fnames = sorted(a for a in os.listdir(replot_dir) if a.endswith('.npz'))
fpaths = [replot_dir + os.sep + a for a in fnames]
print(fpaths)



In [None]:
print(fpaths[9])
X = state_load(fpaths[9], cells_as_cols=True, num_genes=None, num_cells=None, txt=False)

In [None]:
loc = (0,-3)
node_idx = lattice_square_loc_to_int(loc, sidelength)
cellstate = X[:, node_idx]
print(cellstate)
print(np.dot(simsetup_main['XI'].T, cellstate)/9.0)

In [None]:
turquoise = [30, 223, 214]

white = [255,255,255]
soft_grey = [225, 220, 222]
soft_grey_alt1 = [206, 199, 182]
soft_grey_alt2 = [219, 219, 219]
beige = [250, 227, 199]

soft_blue = [148, 210, 226]
soft_blue_alt1 = [58, 128, 191]

soft_red = [192, 86, 64]
soft_red_alt1 = [240, 166, 144]
soft_red_alt2 = [255, 134, 113]

soft_yellow = [237, 209, 112]

soft_orange = [250, 173, 63]
soft_orange_alt1 = [248, 200, 140]

soft_green = [120, 194, 153]
sharp_green = [142, 200, 50]

soft_purple = [177, 156, 217]

soft_grey_norm = np.array(soft_grey) / 255.0

color_anchor_beige = np.array(beige) / 255.0
color_anchor_white = np.array(white) / 255.0
color_anchor = color_anchor_white


#color_A_pos = np.array(soft_blue_alt1) / 255.0
color_A_pos = np.array(soft_blue) / 255.0
color_A_neg = np.array(soft_orange) / 255.0

color_B_pos = np.array(soft_red) / 255.0
color_B_neg = np.array(soft_green) / 255.0

color_C_pos = np.array(soft_yellow) / 255.0
color_C_neg = np.array(soft_purple) / 255.0

def linear_interpolate(val, c2, c1=color_anchor):
    """
    Linearly blend two colours: the result is c1 for val = 0 and c2 for val = 1.

    val: blend fraction; asserted to lie in [0, 1] (up to 1e-4 above 1 is tolerated)
    c2:  colour reached at val = 1
    c1:  colour at val = 0 (defaults to color_anchor)
    """
    tolerance = 1e-4
    assert 0.0 <= val <= 1.0 + tolerance
    return c1 + val * (c2 - c1)

In [None]:
def fill_arr_color(color):
    """Return a 10 x 10 x 3 float array in which every pixel equals the first three entries of `color`."""
    patch = np.zeros((10, 10, 3))
    for channel in range(3):
        patch[:, :, channel] += color[channel]
    return patch

fig, axarr = plt.subplots(1,3)
a = np.array([color_A_pos]).reshape(1,1,3)
axarr[0].imshow(a)
b = np.array([color_B_pos]).reshape(1,1,3)
axarr[1].imshow(b)
c = np.array([color_C_pos]).reshape(1,1,3)
axarr[2].imshow(c)
plt.show()

In [None]:
fig, axarr = plt.subplots(1,3)
colour_mix = linear_interpolate(0.3, color_A_pos, c1=color_B_pos)
a = np.array([colour_mix]).reshape(1,1,3)
axarr[0].imshow(a)

colour_mix = linear_interpolate(0.3, color_A_pos, c1=color_C_pos)
b = np.array([colour_mix]).reshape(1,1,3)
axarr[1].imshow(b)

colour_mix = linear_interpolate(0.3, color_B_pos, c1=color_C_pos)
c = np.array([colour_mix]).reshape(1,1,3)
axarr[2].imshow(c)
plt.show()

In [None]:
a = np.array(soft_purple)/255.0
b = np.array(soft_green)/255.0
c = np.array(soft_orange)/255.0

amount = np.sqrt(0.11)

fig, axarr = plt.subplots(1,3)
colour_mix = linear_interpolate(amount, a, c1=color_anchor_white)
axarr[0].imshow(np.array([colour_mix]).reshape(1,1,3))

colour_mix = linear_interpolate(amount, b, c1=color_anchor_white)
axarr[1].imshow(np.array([colour_mix]).reshape(1,1,3))

colour_mix = linear_interpolate(amount, c, c1=color_anchor_white)
axarr[2].imshow(np.array([colour_mix]).reshape(1,1,3))

plt.show()

In [None]:
np.sqrt(0.2)

# Manual UMAP plots

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import pickle
import joblib
import pandas as pd
import time

import plotly
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go

import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

from singlecell.singlecell_linalg import sorted_eig
from utils.file_io import RUNS_FOLDER

In [None]:
from multicell.unsupervised_helper import make_dimreduce_object, save_dimreduce_object

def plotly_express_embedding_LOCAL(data_subdict, color_by_index=False, as_landscape=False,
                             fmod='', show=False, dirpath=None, surf=False, step=None):
    """
    Plot every embedding in data_subdict['algos'] with plotly and save each plot as
    <dirpath>/<algo>_plotly_<label><fmod>.html and .png.

    Supports 2D and 3D embeddings

    data_subdict: dict with keys 'num_runs', 'label', 'energies', 'algos' (and 'path' when
        dirpath is None); every data_subdict['algos'][name]['embedding'] has 2 or 3 columns
    color_by_index: for troubleshooting, colors the points according to their array position
        if False (default), color by energy (column 0 of 'energies') instead
    as_landscape: if True, plot the first two embedding coordinates against the energy on z
    fmod: file name suffix; '_jupyter' is always appended to it
    show: if True, also call fig.show() for each plot
    dirpath: output folder, created if missing; defaults to <data_subdict['path']>/dimreduce
    surf: with as_landscape, draw a triangulated surface instead of a 3D scatter plot
    step: optional step number, appended to the plot title
    """
    from scipy.spatial import Delaunay

    # colormaps here: https://plotly.com/python/builtin-colorscales/
    fmod += '_jupyter'

    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    if dirpath is None:
        dirpath = data_subdict['path'] + os.sep + 'dimreduce'
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(dirpath, exist_ok=True)

    smod = ''
    if step is not None:
        smod = ' (step %d)' % step

    if color_by_index:
        c = np.arange(num_runs)
        fmod += '_cIndex'
        clabel = 'index'
    else:
        c = data_subdict['energies'][:, 0]  # range(num_runs)
        clabel = 'energy'

    for algo, algodict in data_subdict['algos'].items():
        embedding = algodict['embedding']

        n_components = embedding.shape[1]
        assert n_components in [2, 3]

        plot_title = '%s of %s dataset%s' % (algo, label, smod)
        plot_path = dirpath + os.sep + "%s_plotly_%s%s" % (algo, label, fmod)

        # columns shared by every plot type; the 'z' column is added per branch below
        # (when clabel == 'index' the colour values simply replace the 'index' column)
        columns = {'index': range(num_runs),
                   clabel: c,
                   'x': embedding[:, 0],
                   'y': embedding[:, 1]}

        if not as_landscape:
            if n_components == 2:
                df = pd.DataFrame(columns)
                fig = px.scatter(df, x='x', y='y',
                                 color=clabel,
                                 title=plot_title,
                                 hover_name='index')
                # transparent plot and paper background
                fig.update_layout({
                    'plot_bgcolor': 'rgba(0,0,0,0)',
                    'paper_bgcolor': 'rgba(0,0,0,0)'})

            else:
                columns['z'] = embedding[:, 2]
                df = pd.DataFrame(columns)
                fig = px.scatter_3d(df, x='x', y='y', z='z',
                                    color=clabel,
                                    title=plot_title,
                                    hover_name='index')

        else:
            plot_title += ' landscape'
            plot_path += '_landscape'
            # landscape: energy on the z axis, whatever the embedding dimension
            columns['z'] = data_subdict['energies'][:, 0]
            df = pd.DataFrame(columns)
            if surf:
                plot_title += ' surface'
                plot_path += 'Surf'

                # Regular trisurf approach (ugly): triangulate the 2D embedding points
                # and drape the energies over that mesh. (An earlier go.Surface attempt,
                # with Z zero everywhere except the diagonal, was dropped.)
                u = embedding[:, 0]
                v = embedding[:, 1]
                points2D = np.vstack([u, v]).T
                simplices = Delaunay(points2D).simplices

                fig = ff.create_trisurf(
                    x=df['x'], y=df['y'], z=df['z'],
                    colormap="Thermal",
                    simplices=simplices,
                    title=plot_title)

            else:
                fig = px.scatter_3d(df, x='x', y='y', z='z',
                                    color=clabel,
                                    title=plot_title,
                                    hover_name='index')

        fig.write_html(plot_path + '.html')
        fig.write_image(plot_path + '.png')
        if show:
            fig.show()
    return

In [None]:
# these set the defaults for modifications introduced in main
REDUCER_SEED = 100
REDUCER_COMPONENTS = 3
#REDUCERS_TO_USE = ['pca']
#REDUCERS_TO_USE = ['tsne']
REDUCERS_TO_USE = ['umap']
#REDUCERS_TO_USE = ['umap', 'tsne', 'pca']
#VALID_REDUCERS = ['umap', 'tsne', 'pca']

# see defaults: https://umap-learn.readthedocs.io/en/latest/api.html
UMAP_KWARGS = {
    'random_state': REDUCER_SEED,
    'n_components': REDUCER_COMPONENTS,
    'metric': 'euclidean',
    'init': 'spectral',
    'unique': False,
    'n_neighbors': 15,
    'min_dist': 0.1,
    'spread': 1.0,
}
TSNE_KWARGS = {
    'random_state': REDUCER_SEED,
    'n_components': REDUCER_COMPONENTS,
    'metric': 'euclidean',
    'init': 'random',
    'perplexity': 30.0,
}
PCA_KWARGS = {
    'n_components': REDUCER_COMPONENTS,
}


# main flags
build_dimreduce_dicts = True
add_control_data = False
vis_all = True
pca_assess = False
plot_specific_points = False
check_evals = False

# data process settings
use_01 = True
jitter_scale = 0  #1e-4
nsubsample = None  # None or an int

# Step 0) which 'manyruns' dirs to work with
#gamma_list = [0.0, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.15, 0.20, 0.4, 0.6, 0.8, 0.9, 1.0, 20.0]
gamma_list = [20.0]

#gamma_list = [0.0, 0.2]
# gamma_list = [2.0, 20.0]

step_list = [None]
# step_list = [0.0, 10.0]  # list of [None] or list of steps
#step_list = [0, 1, 2, 3] + list(np.arange(4, 20, 5))
#step_list = [0, 1, 2]
#step_list = [0] + list(range(4, 30, 5))
#step_list = list(range(0, 10, 1))

#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_p3_M100' % a for a in gamma_list]
#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_fixedorderNotOrig_p3_M100' % a for a in gamma_list]
#manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]
manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_periodic_fixedorderV3_p3_M100' % a for a in gamma_list]

manyruns_paths = [RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
                  for dirname in manyruns_dirnames]

# Step 1) umap (or other dim reduction) kwargs
# For each (n_components, step) pair this block: builds per-reducer kwargs,
# derives the filename suffix `fmod`, loads or computes one dim-reduction object
# per manyruns dir, then optionally adds control data and writes plots.
# `datasets` and `fmod` stay defined after the loop and are read by later cells.
# NOTE(review): `plot_specific_points` is not part of this guard, so enabling
# only that flag does nothing -- confirm this is intended.
if any([build_dimreduce_dicts, add_control_data, vis_all, pca_assess]):
    for n_components in [2]:

        for step in step_list:
            #n_components = 3
            # Work on copies so the module-level default dicts are never mutated.
            pca_kwargs = PCA_KWARGS.copy()
            pca_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
            umap_kwargs = UMAP_KWARGS.copy()
            umap_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
            tsne_kwargs = TSNE_KWARGS.copy()
            tsne_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
            # modify pca settings
            # modify umap settings
            #umap_kwargs['unique'] = True
            #umap_kwargs['n_neighbors'] = 100
            umap_kwargs['min_dist'] = 0.25
            umap_kwargs['spread'] = 1.0
            #umap_kwargs['metric'] = 'euclidean'
            # modify tsne settings
            #tsne_kwargs['perplexity'] = 100

            # Modify filename suffix for dimreduce pkl and plots.
            # Only settings that differ from the defaults are appended; the dim
            # and seed are always read from umap_kwargs, whichever reducer runs.
            fmod = ''
            if step is not None:
                fmod += '_step%d' % step
            fmod += '_F=' + '+'.join(REDUCERS_TO_USE)
            fmod += '_dim%d_seed%d' % (umap_kwargs['n_components'],
                                       umap_kwargs['random_state'])
            if use_01:
                fmod += '_use01'
            if nsubsample is not None:
                fmod += '_nn%d' % nsubsample
            if jitter_scale > 0:
                fmod += '_jitter%.4f' % jitter_scale
            if 'umap' in REDUCERS_TO_USE:
                if umap_kwargs['metric'] != 'euclidean':
                    fmod += '_%s' % umap_kwargs['metric']
                if umap_kwargs['init'] != 'spectral':
                    fmod += '_%s' % umap_kwargs['init']
                if umap_kwargs['n_neighbors'] != 15:
                    fmod += '_nbor%d' % umap_kwargs['n_neighbors']
                if umap_kwargs['min_dist'] != 0.1:
                    fmod += '_dist%.2f' % umap_kwargs['min_dist']
                if umap_kwargs['spread'] != 1.0:
                    fmod += '_spread%.2f' % umap_kwargs['spread']
                if umap_kwargs['unique']:
                    fmod += '_unique'
            if 'tsne' in REDUCERS_TO_USE:
                if tsne_kwargs['perplexity'] != 30.0:
                    fmod += '_perplex%.2f' % tsne_kwargs['perplexity']

            # Step 2) make/load data
            # Start from a minimal {'label', 'path'} stub per manyruns dir; each
            # stub is replaced below by the loaded or freshly computed object.
            datasets = {i: {'label': manyruns_dirnames[i],
                            'path': manyruns_paths[i]}
                        for i in range(len(manyruns_dirnames))}

            for idx in range(len(manyruns_dirnames)):
                fpath = manyruns_paths[idx] + os.sep + 'dimreduce' \
                        + os.sep + 'dimreduce%s.z' % fmod
                if os.path.isfile(fpath):
                    print('Exists already, loading: %s' % fpath)
                    fcontents = joblib.load(fpath)  # just load file if it exists
                    datasets[idx] = fcontents
                else:
                    print('Dim. reduction on manyruns: %s' % manyruns_dirnames[idx])
                    # Pass the use_01 flag through (it was hard-coded to True) so
                    # the computed data always matches the '_use01' tag in fmod.
                    datasets[idx] = make_dimreduce_object(
                        datasets[idx], nsubsample=nsubsample, flag_control=False,
                        use_01=use_01, jitter_scale=jitter_scale,
                        umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs,
                        step=step)
                    save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

            if add_control_data:
                print('adding control data...')
                # Control data is sized to match the first dataset.
                total_spins_0 = datasets[0]['total_spins']
                num_runs_0 = datasets[0]['num_runs']

                # add control data into the dict of datasets (stored under key -1)
                control_X = generate_control_data(total_spins_0, num_runs_0)
                control_folder = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + 'control'
                control_fpath = control_folder + os.sep + \
                                'dimreduce' + os.sep + 'dimreduce%s.z' % fmod

                datasets[-1] = {
                    'data': control_X,
                    'label': 'control (coin-flips)',
                    'num_runs': num_runs_0,
                    'total_spins': total_spins_0,
                    'energies': np.zeros((num_runs_0, 5)),
                    'path': control_folder
                }
                datasets[-1] = make_dimreduce_object(
                    datasets[-1], flag_control=True,
                    nsubsample=nsubsample, jitter_scale=jitter_scale, use_01=use_01,
                    umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs)
                save_dimreduce_object(datasets[-1], control_fpath)  # save to file (joblib)

            # Step 3) vis data
            if vis_all:
                for idx in range(0, len(manyruns_dirnames)):
                    # Three views per dataset: default, coloured by index, and landscape.
                    plotly_express_embedding_LOCAL(
                        datasets[idx], fmod=fmod, show=False,
                        step=step)
                    plotly_express_embedding_LOCAL(
                        datasets[idx], fmod=fmod, color_by_index=True, show=False,
                        step=step)
                    plotly_express_embedding_LOCAL(
                        datasets[idx], fmod=fmod, as_landscape=True, show=False,
                        step=step)
                    #plotly_express_embedding(
                    #    datasets[idx], fmod=fmod, as_landscape=True, show=False, surf=True)
                    if pca_assess:
                        pca_assess_dataset(datasets[idx], fmod=fmod, show=False)

                if add_control_data:
                    plotly_express_embedding_LOCAL(datasets[-1], fmod=fmod, color_by_index=True)
                    if pca_assess:
                        pca_assess_dataset(datasets[-1], fmod=fmod, show=False)

            # Step 3b) plot special indices of the multicell state
            if plot_specific_points:
                #agg_indices = [2611, 2289]
                agg_indices = [481, 4774]
                outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'plot_specific_points'

                for idx in range(0, len(manyruns_dirnames)):

                    multicell = datasets[idx]['multicell_template']

                    for agg_index in agg_indices:
                        # pull relevant info from subdict
                        X = datasets[idx]['data'][agg_index, :]
                        step_hack = 0  # TODO care this will break if class has time-varying applied field
                        # Write the selected row into the template's state column
                        # so the template can be plotted in that state.
                        multicell.graph_state_arr[:, step_hack] = X[:]
                        #assert np.array_equal(multicell_template.field_applied, np.zeros((total_spins, multicell_template.total_steps)))
                        plot_given_multicell(multicell, step_hack, agg_index, outdir)

# Step 4) eval check of Jij
# For every manyruns dir, unpickle the stored multicell template and scatter-plot
# the eigenvalues of its `matrix_J_multicell` as returned by sorted_eig.
if check_evals:
    for idx, dirpath in enumerate(manyruns_paths):
        fpath_pickle = dirpath + os.sep + 'multicell_template.pkl'
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # run directories produced by this project.
        with open(fpath_pickle, 'rb') as pickle_file:
            multicell_template = pickle.load(pickle_file)  # unpickling multicell object

        J_multicell = multicell_template.matrix_J_multicell
        evals, evecs = sorted_eig(J_multicell, take_real=True)
        plt.scatter(range(len(evals)), evals)
        plt.title(r'Spectrum of $J_{\mathrm{multicell}}$ for: %s' % os.path.basename(dirpath))
        # Raw strings: '\l' is an invalid escape sequence in a normal string
        # literal and triggers a warning on recent Python versions.
        plt.xlabel(r'rank of $\lambda$')
        plt.ylabel(r'$\lambda$')
        plt.show()


In [None]:
def axis_bounds(embedding, margin=0.1):
    """Return padded axis limits that enclose the first two embedding columns.

    Parameters
    ----------
    embedding : array-like
        2D array of points, one row per point; column 0 is treated as x and
        column 1 as y. Any further columns are ignored.
    margin : float, optional
        Padding added on each side, as a fraction of the data extent along
        that axis. The default of 0.1 reproduces the previous fixed 10% padding.

    Returns
    -------
    list
        ``[xmin, xmax, ymin, ymax]``, in the order accepted by ``ax.axis(...)``.
    """
    embedding = np.asarray(embedding)  # also accept plain nested lists
    left, right = embedding.T[0].min(), embedding.T[0].max()
    bottom, top = embedding.T[1].min(), embedding.T[1].max()
    adj_h, adj_v = (right - left) * margin, (top - bottom) * margin
    return [left - adj_h, right + adj_h, bottom - adj_v, top + adj_v]

# Grid of UMAP embeddings, one panel per gamma, coloured by the first energy
# column. Every panel uses the same axis limits so panels are comparable.
num_rows, num_cols = 2, 8
fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 10))

# Shared limits computed from all embeddings stacked together
ax_bound = axis_bounds(np.vstack([
    datasets[k]['algos']['umap']['embedding'] for k in range(len(gamma_list))
]))

for i, ax in enumerate(axs.flatten()):
    # Grid cells beyond the number of gammas are removed from the figure
    if i >= len(gamma_list):
        fig.delaxes(ax)
        continue

    print(i)
    Xd = datasets[i]['algos']['umap']['embedding']
    energies = datasets[i]['energies'][:, 0]
    print(Xd.shape)
    ax.set_title(gamma_list[i])
    ax.scatter(Xd[:, 0], Xd[:, 1], s=1, c=energies, cmap="Spectral_r")
    ax.axis(ax_bound)
    ax.set(xticks=[], yticks=[])

# Save before show(); a '.pdf' extension can be used instead of '.jpg'
plt.savefig('Subplots%d%s.jpg' % (len(gamma_list), fmod))
plt.show()


In [None]:
# Single-panel UMAP embedding for one selected gamma, coloured by the first
# energy column.
plt.figure(figsize=(3, 7))
ax = plt.gca()

# Index into gamma_list / datasets. Clamped to the last available entry so the
# cell still runs when a shorter gamma_list is configured; the panel title
# shows which gamma was actually plotted.
i = min(3, len(gamma_list) - 1)
Xd = datasets[i]['algos']['umap']['embedding']
energies = datasets[i]['energies'][:,0]
print(Xd.shape)
ax.set_title(gamma_list[i])

#ax.scatter(Xd[:,0], Xd[:,1], s=1, cmap="Spectral")
#ax.scatter(Xd[:,0], Xd[:,1], s=1, c=energies, cmap="RdYlBu_r")
sc = ax.scatter(Xd[:,0], Xd[:,1], s=5, c=energies, cmap="Spectral_r", alpha=1.0, edgecolor='k', linewidths=0.0)
# The colorbar has to be created before its ticks can be styled; with the
# creation line commented out, `cbar` was undefined on a fresh kernel.
# To omit the colorbar, remove both of the following lines together.
cbar = plt.colorbar(sc)
cbar.ax.tick_params(size=0)

#ax.axis(ax_bound)
ax.set(xticks=[], yticks=[])
#plt.tight_layout()
#plt.savefig('Subplots%d%s.jpg' % (len(gamma_list), fmod))
#plt.savefig('Subplots%d%s.pdf' % (len(gamma_list), fmod))
plt.show()


In [None]:

# Grid of UMAP embeddings for a hand-picked subset of gammas, coloured by the
# first energy column. Each panel keeps its own axis limits.
num_rows = 2
num_cols = 9
fig, axs = plt.subplots(num_rows, num_cols, figsize=(20, 10))
#ax_bound = axis_bounds(
#    np.vstack( [datasets[i]['algos']['umap']['embedding'] for i in range(len(gamma_list))] )
#)

# Indices into gamma_list / datasets to display.
# To show every gamma instead, use: picks = list(range(len(gamma_list)))
picks = [0, 3, 4, 6, 7, 8, 9, 11, 13]
# Keep only indices that exist for the configured gamma_list, so the cell does
# not fail on a missing dataset when a shorter list is in use.
picks = [p for p in picks if p < len(gamma_list)]

for j, ax in enumerate(axs.flatten()):
    if j < len(picks):
        print(j)
        i = picks[j]
        Xd = datasets[i]['algos']['umap']['embedding']
        energies = datasets[i]['energies'][:,0]
        print(Xd.shape)
        ax.set_title(gamma_list[i])
        #ax.scatter(Xd[:,0], Xd[:,1], s=1, cmap="Spectral")
        ax.scatter(Xd[:,0], Xd[:,1], s=1, c=energies, cmap="Spectral_r")
        #ax.axis(ax_bound)
        ax.set(xticks=[], yticks=[])
    else:
        # Unused grid cells are removed from the figure
        fig.delaxes(ax)
#plt.tight_layout()
# Save before show(); the count in the filename is len(gamma_list), not len(picks)
plt.savefig('PicksSubplots%d%s.jpg' % (len(gamma_list), fmod), dpi=300)
#plt.savefig('Subplots%d%s.pdf' % (len(gamma_list), fmod))
plt.show()
