Testing multicell functionality



# Test with own multicell data

In [None]:
%matplotlib inline

# path hack for relative import in jupyter notebook
import os
import sys

# LIBRARY GLOBAL MODS
# os.path.abspath('') resolves to the current working directory; its parent
# directory is appended to sys.path (presumably so that project packages such
# as `utils` resolve when the notebook runs from a subfolder -- confirm).
CELLTYPES = os.path.dirname(os.path.abspath(''))
sys.path.append(CELLTYPES)

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import pandas as pd
import umap
import time
%matplotlib inline

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

from utils.file_io import RUNS_FOLDER

# TODO 1: bokeh plot with s%d
# TODO 2: visualize s%d, get energy of s%d

In [None]:
# <RUNS_FOLDER>/explore/nb_explore, created if missing.
# NOTE(review): not referenced by any other visible cell -- presumably meant
# as the output folder for files written by this notebook.
NOTEBOOK_OUTDIR = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'nb_explore'
os.makedirs(NOTEBOOK_OUTDIR, exist_ok=True)

In [None]:
def gen_control_data(total_spins, num_runs):
    """Draw a control dataset of independent, uniformly random +/-1 spins.

    Uses numpy's global random state (np.random.randint).

    Args:
        total_spins: number of columns (spins per sample).
        num_runs: number of rows (samples).

    Returns:
        Integer array of shape (num_runs, total_spins) with entries in {-1, +1}.
    """
    coin_flips = np.random.randint(2, size=(num_runs, total_spins))  # entries are 0 or 1
    return 2 * coin_flips - 1  # map {0, 1} -> {-1, +1}

In [None]:
umap_seed = 100
# see defaults: https://umap-learn.readthedocs.io/en/latest/api.html
# Shared UMAP settings; the same dict is reused when embedding the control
# dataset in a later cell.
umap_kwargs = {
    'random_state': umap_seed,
    'n_components': 3,
    'metric': 'euclidean',  
    'init': 'spectral',
    'min_dist': 0.1,
    'spread': 1.0,
}

#gamma_vals = ['0e','0.05e', '0.1e', '0.2e', '1e', '2e','20e']
# Each entry is the suffix of a run folder named 'multicell_manyruns_gamma<suffix>'.
gamma_vals = ['0e_10k','0.05e_10k', '0.1e_10k', '0.2e_10k', '1e_10k', '2e_10k', '20e_10k']
manyruns_datadirs = [RUNS_FOLDER + os.sep + 'multicell_manyruns_gamma%s' % gamma for gamma in gamma_vals]
# datasets maps the position in gamma_vals -> dict of data/metadata for that run folder.
datasets = {i: {'label': 'gamma_%s' % gamma_vals[i]} for i in range(len(gamma_vals))}

for idx in range(len(gamma_vals)):
    fpath_state = manyruns_datadirs[idx] + os.sep + 'aggregate' + os.sep + 'X_aggregate.npz'
    fpath_energy = manyruns_datadirs[idx] + os.sep + 'aggregate' + os.sep + 'X_energy.npz'
    fpath_pickle = manyruns_datadirs[idx] + os.sep + 'multicell_template.pkl'
    print(fpath_state)
    # After the transpose each row is treated as one run (see num_runs below).
    X = np.load(fpath_state)['arr_0'].T               # umap wants transpose
    X_energies = np.load(fpath_energy)['arr_0'].T     # umap wants transpose (?)
    # NOTE(review): pickle.load can execute arbitrary code from the file --
    # only point this at run folders you trust.
    with open(fpath_pickle, 'rb') as pickle_file:
        multicell_template = pickle.load(pickle_file)  # unpickling multicell object
    
    # store data and metadata in datasets object
    num_runs, total_spins = X.shape
    print(X.shape)
    datasets[idx]['data'] = X
    datasets[idx]['index'] = list(range(num_runs))
    datasets[idx]['energies'] = X_energies
    datasets[idx]['num_runs'] = num_runs
    datasets[idx]['total_spins'] = total_spins
    datasets[idx]['multicell_template'] = multicell_template
    
    # perform dimension reduction
    # (a fresh reducer is fitted per dataset; the timing covers fit + transform)
    t1 = time.time()
    datasets[idx]['reducer'] = umap.UMAP(**umap_kwargs)
    datasets[idx]['reducer'].fit(X)
    datasets[idx]['embedding'] = datasets[idx]['reducer'].transform(X)
    print('Time to fit: %.2f sec' % (time.time() - t1))
    
    # Verify that the result of calling transform is
    # identical to accessing the embedding_ attribute
    assert(np.all(datasets[idx]['embedding'] == datasets[idx]['reducer'].embedding_))
    print('embedding.shape:', datasets[idx]['embedding'].shape)

In [None]:
# Size the control dataset like the first loaded dataset.
total_spins_0 = datasets[0]['total_spins']
num_runs_0 = datasets[0]['num_runs']

# add control data into the dict of datasets
# (key -1 keeps it apart from the gamma datasets, which use keys 0..len(gamma_vals)-1)
control_X = gen_control_data(total_spins_0, num_runs_0)
control_reducer = umap.UMAP(**umap_kwargs)
control_reducer.fit(control_X)
datasets[-1] = {
    'label': 'control (coin-flips)',
    'data': control_X,
    'num_runs': num_runs_0,
    'total_spins': total_spins_0,
    'reducer': control_reducer,
    'embedding': control_reducer.transform(control_X),
    # All-zero placeholder (the plotting helpers read column 0). Sized with
    # num_runs_0 rather than `num_runs`, which is only a leftover variable of
    # the loading loop and need not match the control's row count.
    'energies': np.zeros((num_runs_0, 5)),
}

In [None]:
def plot_umap_of_data_nonBokeh(data_subdict):
    """Static matplotlib scatter of the first two embedding components.

    Args:
        data_subdict: dataset dict providing
            'label'     -- name used in the plot title,
            'embedding' -- 2D array; columns 0 and 1 are plotted,
            'energies'  -- 2D array; column 0 sets the point colours.

    Draws into the current matplotlib figure (scatter, colorbar, title) and
    returns None.
    """
    label = data_subdict['label']
    embedding = data_subdict['embedding']
    colour_values = data_subdict['energies'][:, 0]

    plt.scatter(embedding[:, 0], embedding[:, 1], c=colour_values, cmap='Spectral', s=5)
    plt.gca().set_aspect('equal', 'datalim')
    plt.colorbar()
    plt.title('UMAP projection of the %s dataset' % label, fontsize=24)

#plot_umap_of_data(datasets[0])

# Bokeh plots multicell

In [None]:
import umap.plot
import bokeh.plotting# import figure, output_file, show, output_notebook

bokeh.plotting.output_notebook()  # this will render inline jupyter notebook

In [None]:
def bokeh_plot(data_subdict, fnamemod=''):
    """Show an interactive bokeh/UMAP plot of one dataset, coloured by energy.

    Args:
        data_subdict: dataset dict with keys 'num_runs', 'label', 'embedding',
            'reducer' (fitted UMAP object) and 'energies' (column 0 is used
            for both the point labels and the hover text).
        fnamemod: suffix appended to the output file name and page title.

    Side effects: directs bokeh output to 'umap_<label><fnamemod>.html' in the
    working directory and shows the plot. Returns None.
    """
    n_points = data_subdict['num_runs']
    label = data_subdict['label']
    coords = data_subdict['embedding']
    fitted_reducer = data_subdict['reducer']
    energy = data_subdict['energies'][:, 0]

    # per-point values displayed when hovering
    hover_data = pd.DataFrame({
        'index': range(n_points),
        'energy': energy,
        'x': coords[:, 0],
        'y': coords[:, 1],
    })

    # this will load html file in new tab
    page_name = "umap_%s%s" % (label, fnamemod)
    bokeh.plotting.output_file(page_name + ".html", title=page_name)

    fig = umap.plot.interactive(fitted_reducer, labels=energy, hover_data=hover_data, point_size=4)
    fig.title.text = 'UMAP projection of the %s dataset' % label

    bokeh.plotting.show(fig)
    return

In [None]:
def bokeh_plot_alt(data_subdict, fnamemod=''):
    """Interactive bokeh/UMAP plot coloured by a pattern overlap of the first cell.

    For the first three cells (consecutive column blocks of 'data'), the
    overlap with the patterns in simsetup['XI'] is computed as
    dot(block, XI) / XI.shape[0] and shown as text in the hover tooltip.
    Points are coloured by column 0 of the overlap of cell 0.

    The block width is taken from XI.shape[0] instead of being fixed at 9;
    the dot product requires exactly that width, so results are unchanged
    whenever the previous fixed-width version worked.

    Args:
        data_subdict: dataset dict with keys 'num_runs', 'label', 'embedding',
            'reducer', 'energies', 'data' and 'multicell_template' (an object
            whose `simsetup['XI']` is a 2D array).
        fnamemod: suffix appended to the output html file name.

    Side effects: prints the shape of the cell-0 overlap array, directs bokeh
    output to 'umap_<label><fnamemod>.html' and shows the plot. Returns None.
    """
    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    embedding = data_subdict['embedding']
    reducer = data_subdict['reducer']
    c = data_subdict['energies'][:, 0]  # range(num_runs)

    # OVERLAP HACK
    data = data_subdict['data']
    xi = data_subdict['multicell_template'].simsetup['XI']
    spins_per_cell = xi.shape[0]

    def cell_overlap(cell_idx):
        # overlap of one cell's block of spins with every column of XI
        block = data[:, cell_idx * spins_per_cell:(cell_idx + 1) * spins_per_cell]
        return np.dot(block, xi) / spins_per_cell

    overlaps = [cell_overlap(cell_idx) for cell_idx in range(3)]
    overlap_strings = [[np.array2string(row) for row in overlap] for overlap in overlaps]
    print(overlaps[0].shape)
    proj_choice = overlaps[0][:, 0]

    # setup hoverdata
    hover_data = pd.DataFrame({'index': range(num_runs),
                               'energy': c,
                               'x': embedding[:, 0],
                               'y': embedding[:, 1],
                               'cell 0': overlap_strings[0],
                               'cell 1': overlap_strings[1],
                               'cell 2': overlap_strings[2]
                              })

    # this will load html file in new tab
    bokeh.plotting.output_file("umap_%s%s.html" % (label, fnamemod), title="umap_%s" % label)

    p = umap.plot.interactive(reducer, labels=proj_choice, hover_data=hover_data, point_size=4)
    p.title.text = 'UMAP projection of the %s dataset' % label

    bokeh.plotting.show(p)
    return

In [None]:
# One interactive plot per gamma dataset (keys 0..len(gamma_vals)-1),
# followed by the control dataset stored under key -1.
for i in range(len(gamma_vals)):
    bokeh_plot(datasets[i])
bokeh_plot(datasets[-1])

# Adding hover pictures (see umap guide)

In [None]:
# The cells of this section work on the first dataset (key 0) and its embedding.
data_subdict = datasets[0]
embedding = data_subdict['embedding']

In [None]:
from io import BytesIO
from PIL import Image
import base64

In [None]:
def embeddable_image(data):
    """Encode a 2D array as a 64x64 greyscale PNG data-URI string.

    The array is cast to uint8 and mapped through 255 - 15 * value before
    being turned into a mode-'L' image and resized with bicubic resampling.

    Returns:
        A string starting with 'data:image/png;base64,' followed by the
        base64-encoded PNG bytes.
    """
    pixels = 255 - 15 * data.astype(np.uint8)
    thumbnail = Image.fromarray(pixels, mode='L').resize((64, 64), Image.BICUBIC)
    with BytesIO() as png_buffer:
        thumbnail.save(png_buffer, format='png')
        encoded = base64.b64encode(png_buffer.getvalue()).decode()
    return 'data:image/png;base64,' + encoded

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10

output_notebook()

In [None]:
# Hover-picture scatter, adapted from the UMAP digits example.
# NOTE(review): `digits` is not assigned in any visible cell (load_digits is
# imported but never called), so `digits.images` and `digits.target_names`
# raise NameError on a fresh kernel -- TODO supply multicell images/labels.
# NOTE(review): `embedding` comes from datasets[0], which was fitted with
# n_components=3 in the visible cells, while the DataFrame below declares
# only two columns ('x', 'y') -- confirm the intended embedding.
# NOTE(review): the 'digit' column holds str() of each row of 'energies',
# whereas the colour mapper factors are built from digits.target_names.
digits_df = pd.DataFrame(embedding, columns=('x', 'y'))
digits_df['digit'] = [str(x) for x in data_subdict['energies']]
digits_df['image'] = list(map(embeddable_image, digits.images))

datasource = ColumnDataSource(digits_df)
color_mapping = CategoricalColorMapper(factors=[str(9 - x) for x in digits.target_names],
                                       palette=Spectral10)

plot_figure = figure(
    title='UMAP projection of multicell dataset',
    plot_width=600,
    plot_height=600,
    tools=('pan, wheel_zoom, reset')
)

# Tooltip template: '@image' and '@digit' are filled from the datasource columns.
plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>Digit:</span>
        <span style='font-size: 18px'>@digit</span>
    </div>
</div>
"""))

plot_figure.circle(
    'x',
    'y',
    source=datasource,
    color=dict(field='digit', transform=color_mapping),
    line_alpha=0.6,
    fill_alpha=0.6,
    size=4
)
show(plot_figure)

# Visualize arbitrary point in umap

In [None]:
def plot_given_multicell(multicell, step_hack, agg_index, outdir):
    """Refresh and visualize the multicell state at one step, saving three images.

    Args:
        multicell: object exposing step_datadict_update_global(step, fill_to_end=...)
            and step_state_visualize(step, fpaths=...).
        step_hack: step index passed to both calls.
        agg_index: integer inserted into the output file names.
        outdir: directory prefix for the three output paths.

    The paths handed to step_state_visualize are, in order,
    '<outdir>/agg<i>_compOverlap.png', '<outdir>/agg<i>_compProj.png' and
    '<outdir>/agg<i>_ref0_overlap.png'. Returns None.
    """
    fname_templates = ('agg%d_compOverlap.png',
                       'agg%d_compProj.png',
                       'agg%d_ref0_overlap.png')
    fpaths = [outdir + os.sep + template % agg_index for template in fname_templates]

    multicell.step_datadict_update_global(step_hack, fill_to_end=False)
    multicell.step_state_visualize(step_hack, fpaths=fpaths)  # visualize
    return

In [None]:
# A: [325, 269, 3918, 5329]
# B: [2145, 8616, 6241, 1632]

# CHOICES
agg_indices = [2145, 8616, 6241, 1632]   # rows of the dataset to visualize
dataset_label = '0e_10k'
outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'B'

# NON-LOOP
os.makedirs(outdir, exist_ok=True)       # also creates the 'explore' parent if missing
dataset_index = gamma_vals.index(dataset_label)
subdict = datasets[dataset_index]
multicell = subdict['multicell_template']

step_hack = 0  # TODO care this will break if class has time-varying applied field
# Check the template of the SELECTED dataset (not whichever template and
# total_spins the loading loop happened to leave behind); the check does not
# depend on agg_index, so it runs once before the loop.
assert np.array_equal(multicell.field_applied,
                      np.zeros((subdict['total_spins'], multicell.total_steps)))

for agg_index in agg_indices:
    # pull relevant info from subdict
    agg_state = subdict['data'][agg_index, :]
    # overwrite the template's state at step_hack with the chosen run, then plot it
    multicell.graph_state_arr[:, step_hack] = agg_state[:]
    plot_given_multicell(multicell, step_hack, agg_index, outdir)

# Inspecting $\gamma=0$ dual clusters

In [None]:
umap_seed = 100
# Column window kept for this test: [45, 99). With 9 spins per cell this is
# presumably cells 5..10 -- confirm against the simulation layout.
spin_low = 5 * 9
spin_high = 11 * 9

#gamma_vals = ['0e','0.05e', '0.1e', '0.2e', '1e', '2e','20e', '20e_10k']
# Dedicated names: reassigning `gamma_vals` / `manyruns_datadirs` here would
# silently change which entries of `datasets` other cells iterate over.
gamma_vals_testing = ['0e_10k']
manyruns_datadirs_testing = [RUNS_FOLDER + os.sep + 'multicell_manyruns_gamma%s' % gamma
                             for gamma in gamma_vals_testing]
datasets_testing = {i: {'label': 'gamma_%s' % gamma_vals_testing[i]}
                    for i in range(len(gamma_vals_testing))}

for idx in range(len(gamma_vals_testing)):
    fpath_state = manyruns_datadirs_testing[idx] + os.sep + 'aggregate' + os.sep + 'X_aggregate.npz'
    fpath_energy = manyruns_datadirs_testing[idx] + os.sep + 'aggregate' + os.sep + 'X_energy.npz'
    fpath_pickle = manyruns_datadirs_testing[idx] + os.sep + 'multicell_template.pkl'
    print(fpath_state)
    X = np.load(fpath_state)['arr_0'].T               # umap wants transpose
    X_energies = np.load(fpath_energy)['arr_0'].T     # umap wants transpose (?)
    # NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
    with open(fpath_pickle, 'rb') as pickle_file:
        multicell_template = pickle.load(pickle_file)  # unpickling multicell object

    # LOCAL SUBSAMPLING for testing only
    X = X[:, spin_low:spin_high]
    print(X.shape)

    # store data and metadata in datasets object
    # (total_spins is the width of the subsampled window, not of the full state)
    num_runs, total_spins = X.shape
    datasets_testing[idx]['data'] = X
    datasets_testing[idx]['index'] = list(range(num_runs))
    datasets_testing[idx]['energies'] = X_energies
    datasets_testing[idx]['num_runs'] = num_runs
    datasets_testing[idx]['total_spins'] = total_spins
    datasets_testing[idx]['multicell_template'] = multicell_template

    # perform dimension reduction (library defaults; the seed is deliberately left unset here)
    t1 = time.time()
    datasets_testing[idx]['reducer'] = umap.UMAP()                                #random_state=umap_seed)
    datasets_testing[idx]['reducer'].fit(X)
    datasets_testing[idx]['embedding'] = datasets_testing[idx]['reducer'].transform(X)
    print('Time to fit: %.2f sec' % (time.time() - t1))

    # Verify that the result of calling transform is
    # identical to accessing the embedding_ attribute
    assert(np.all(datasets_testing[idx]['embedding'] == datasets_testing[idx]['reducer'].embedding_))
    print('embedding.shape:', datasets_testing[idx]['embedding'].shape)

In [None]:
# Interactive plot of the spin-subsampled dataset; '_testing' is appended to
# the html file name. Note that this rebinds `subdict`.
subdict = datasets_testing[0]
bokeh_plot_alt(subdict, fnamemod='_testing')

In [None]:
# Print the embedding coordinates of one hand-picked run (row 7091).
embedding = subdict['embedding']
print(embedding[7091, :])

In [None]:
# Pull the pattern matrix XI out of the unpickled template's simsetup and print it.
multicell = subdict['multicell_template']
simsetup = multicell.simsetup
XI = simsetup['XI']
print(simsetup['XI'])

# 3D vis attempts

In [None]:
# see https://www.kaggle.com/scratchpad/notebook163accf2b7/edit

import umap
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import seaborn as sns
import plotly
import plotly.express as px


%matplotlib notebook

In [None]:
def umap_3d_mpl(dataset):
    """3D matplotlib scatter of the first three embedding components.

    All points are drawn with a single scatter call. Issuing one call per
    point creates one artist per point, which becomes very slow for large
    datasets. As a consequence every point shares one colour.

    Args:
        dataset: dict with 'embedding' (2D array with at least 3 columns)
            and 'label' (used in the title).
    """
    embedding = dataset['embedding']

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], marker='o')

    ax.set_xlabel('u1')
    ax.set_ylabel('u2')
    ax.set_zlabel('u3')
    ax.set_title("umap_%s" % dataset['label'])

    plt.show()

    
def umap_plotly_express(data_subdict):
    """Plotly-express scatter of a 2D or 3D embedding, coloured by energy.

    Args:
        data_subdict: dataset dict with keys 'num_runs', 'label',
            'embedding' (2 or 3 columns) and 'energies' (column 0 is used
            as the colour value).

    Side effects: writes 'umap_plotly_<label>.html' to the working directory
    and shows the figure.

    Raises:
        AssertionError: if the embedding does not have 2 or 3 columns.
    """
    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    embedding = data_subdict['embedding']
    energy = data_subdict['energies'][:, 0]

    umap_dim = embedding.shape[1]
    assert umap_dim in [2, 3]

    # columns and axis arguments shared by the 2D and 3D cases
    columns = {'index': range(num_runs),
               'energy': energy,
               'x': embedding[:, 0],
               'y': embedding[:, 1]}
    axes = {'x': 'x', 'y': 'y'}
    scatter = px.scatter
    if umap_dim == 3:
        columns['z'] = embedding[:, 2]
        axes['z'] = 'z'
        scatter = px.scatter_3d

    df = pd.DataFrame(columns)
    fig = scatter(df, color='energy', title='UMAP of %s dataset' % label, **axes)
    fig.write_html("umap_plotly_%s.html" % label)
    fig.show()
    

def umap_plotly_general(dataset):
    """
    TODO: implement this more general version of the 'express' code umap_plotly_express() -- see docs
    Supports 2D and 3D embeddings

    Placeholder: `dataset` is currently ignored and a demo helix is shown.
    """
    import plotly.graph_objects as go

    # Helix equation
    t = np.linspace(0, 10, 50)
    x, y, z = np.cos(t), np.sin(t), t

    fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,
                                       mode='markers')])
    fig.show()

#umap_3d_plotly_generic(None)

In [None]:
# Plot every loaded gamma dataset (non-negative keys of `datasets`), then the
# control stored under key -1. Iterating over the keys means the loop does
# not depend on what `gamma_vals` holds when this cell runs.
for i in sorted(key for key in datasets if key >= 0):
    umap_plotly_express(datasets[i])
umap_plotly_express(datasets[-1])

# Investigate umap at $\gamma=0$

In [None]:
# run path blocks at top first
import joblib

#gamma0_manyruns = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + 'gamma2.00_10k'
gamma0_manyruns = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + 'Wrandom0_gamma0.00_10k_fixedorderNot_p3_M100'


# Precomputed dimension-reduction results; this notebook only loads them.
fpath = gamma0_manyruns + os.sep + 'dimreduce' + os.sep + 'dimreduce_F=umap_dim3_seed100_use01.z'
if not os.path.isfile(fpath):
    # Fail with the missing path instead of a bare `assert 1==2`, which gives
    # no hint and is skipped entirely when Python runs with -O.
    raise FileNotFoundError('Expected precomputed dimreduce file not found: %s' % fpath)

print('Exists already, loading: %s' % fpath)
fcontents = joblib.load(fpath)  # just load file if it exists

print(fcontents.keys())

In [None]:
# Column 0 of the stored energies (the column the plotting helpers colour by).
print(fcontents['energies'][:,0])

In [None]:
def X_to_mirror_labels(X):
    """Label each row of X by whether its global spin flip also occurs in X.

    Convention: if a state and its mirror flip (-state) are both present, then
      'A' is given to rows whose spin at index 0 is +1,
      'B' is given to rows whose spin at index 0 is -1.
    Rows whose mirror flip does not occur anywhere in X are labelled 'Z'.

    The rows are hashed once into a set, so each row costs one lookup instead
    of two comparisons against the whole array.

    Args:
        X: 2D array of +/-1 spins, one state per row.

    Returns:
        List of labels ('A', 'B' or 'Z'), one per row of X.

    Raises:
        KeyError: if the first spin of a row is neither +1 nor -1.
    """
    up_label = 'A'
    down_label = 'B'
    none_label = 'Z'
    spin_to_letter = {1: up_label, -1: down_label}

    # plain-Python tuples compare and hash by value, row by row
    states = [tuple(row) for row in np.asarray(X).tolist()]
    states_present = set(states)

    mirror_labels = []
    for state in states:
        letter_choice = spin_to_letter[state[0]]
        flipped = tuple(-spin for spin in state)
        mirror_labels.append(letter_choice if flipped in states_present else none_label)

    return mirror_labels

# X_to_mirror_labels needs +/-1 spins (it looks up the first spin in {1, -1}
# and negates the state). fcontents['data'] is presumably stored as 0/1 --
# the file name contains 'use01' and the following cell applies the same
# 2*x - 1 conversion -- so convert before labelling.
fcontents['mirror_labels'] = X_to_mirror_labels(2 * fcontents['data'] - 1)
# show the distinct labels rather than one label per run
print(set(fcontents['mirror_labels']))

In [None]:
# Map the stored spins through 2*x - 1 (assumes fcontents['data'] is 0/1, so
# the result is +/-1 -- TODO confirm), relabel mirror pairs on the converted
# array and print the distinct labels found.
X_data = fcontents['data']
X_pm1 = 2*X_data - 1
fcontents['mirror_labels'] = X_to_mirror_labels(X_pm1)
print(set(fcontents['mirror_labels']))

In [None]:
# Overlap of every run with each of the first n_hist runs, used as references.
n_hist = 100
num_runs, num_spins = X_pm1.shape

state_overlap_sum = np.zeros(num_runs)
for idx in range(n_hist):
    x0 = X_pm1[idx,:]
    # normalized dot product of all runs with reference run idx
    state_overlap = np.dot(X_pm1, x0) / float(num_spins)
    state_overlap_sum += state_overlap
    # all n_hist histograms are overlaid in the same figure
    plt.hist(state_overlap, bins=100, alpha=0.4)
    
# average over the n_hist references, shown in a separate figure
state_overlap_mean = state_overlap_sum / n_hist
plt.figure()
plt.hist(state_overlap_mean, bins=100, alpha=0.4)
plt.title('Mean overlap distribution')

In [None]:
#plt.figure()
#plt.hist(state_overlap_mean, bins=100, alpha=0.4)

**Look at overlap of all points with +1 and -1 states**

In [None]:
# Per-run magnetization: row sum of X_pm1 divided by the number of spins.
# Note: this rebinds the notebook-level `total_spins`.
total_spins = X_pm1.shape[1]
magnetizations = np.dot(X_pm1, np.ones(total_spins)) / float(total_spins)

In [None]:
# Distribution of the per-run magnetizations over all runs.
plt.hist(magnetizations, bins=100, alpha=0.4)
plt.title('magnetizations')

**K means on umap to get labels for points in each cluster**

In [None]:
from sklearn.cluster import KMeans

# Two independent 8-cluster k-means fits with the same seed: one on the
# +/-1 spin states, one on the stored UMAP embedding.
kmeans_highdim = KMeans(n_clusters=8, random_state=0).fit(X_pm1)
kmeans_lowdim = KMeans(n_clusters=8, random_state=0).fit(fcontents['algos']['umap']['embedding'])

#kmeans.labels_
#kmeans.predict([[0, 0], [12, 3]])
#kmeans.cluster_centers_
#fit_predict(X, y=None, sample_weight=None)

In [None]:
def label_to_colour(a):
    """Map a k-means cluster label (0..7) to a distinct colour name.

    Args:
        a: integer cluster label in 0..7.

    Returns:
        Colour name string; each label maps to a different name.

    Raises:
        KeyError: if `a` is not one of 0..7.
    """
    d = {0: 'blue', 1: 'red', 2: 'green', 3: 'purple',
         4: 'pink', 5: 'orange', 6: 'cyan', 7: 'brown'}
    return d[a]

# One colour name per run, from the spin-space and the embedding-space clustering.
fcontents['colours_highdim'] = [label_to_colour(a) for a in kmeans_highdim.labels_]
fcontents['colours_lowdim'] = [label_to_colour(a) for a in kmeans_lowdim.labels_]

In [None]:
def umap_plotly_clustered(data_subdict):
    """Plotly-express scatter of the stored UMAP embedding, coloured by cluster.

    Args:
        data_subdict: dict with keys 'num_runs', 'label', 'colours_lowdim'
            (one colour name per run) and 'algos' -> 'umap' -> 'embedding'
            (2 or 3 columns).

    Side effects: writes 'umap_plotly_<label>_clustered.html' to the working
    directory and shows the figure.

    Raises:
        AssertionError: if the embedding does not have 2 or 3 columns.
    """
    import plotly.express as px

    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    embedding = data_subdict['algos']['umap']['embedding']
    c = data_subdict['colours_lowdim']

    umap_dim = embedding.shape[1]
    assert umap_dim in [2, 3]

    # columns and axis arguments shared by the 2D and 3D cases
    columns = {'index': range(num_runs),
               'c': c,
               'x': embedding[:, 0],
               'y': embedding[:, 1]}
    axes = {'x': 'x', 'y': 'y'}
    scatter = px.scatter
    if umap_dim == 3:
        columns['z'] = embedding[:, 2]
        axes['z'] = 'z'
        scatter = px.scatter_3d

    df = pd.DataFrame(columns)
    fig = scatter(df, color='c', title='UMAP of %s dataset' % label, **axes)
    fig.write_html("umap_plotly_%s_clustered.html" % label)
    fig.show()
    
def plotly_express_embedding_clustered(data_subdict):
    """Plot every stored embedding, coloured by the spin-space cluster of each run.

    Supports 2D and 3D embeddings.

    Args:
        data_subdict: dict with keys 'num_runs', 'label', 'path',
            'colours_highdim' (one colour name per run) and 'algos', a dict
            mapping an algorithm name to a dict holding 'embedding'
            (2 or 3 columns).

    Side effects: for each algorithm, writes
    '<path>/dimreduce/<algo>_plotly_<label>_clustered.html' (creating the
    folder if needed) and shows the figure.

    Raises:
        AssertionError: if an embedding does not have 2 or 3 columns.
    """
    import plotly.express as px

    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    dirpath = data_subdict['path'] + os.sep + 'dimreduce'
    cluster_colours = data_subdict['colours_highdim']

    os.makedirs(dirpath, exist_ok=True)

    for algo, algodict in data_subdict['algos'].items():
        embedding = algodict['embedding']

        n_components = embedding.shape[1]
        assert n_components in [2, 3]

        # The colour column holds cluster colours, so it is named 'cluster'
        # (this is also the legend title).
        columns = {'index': range(num_runs),
                   'cluster': cluster_colours,
                   'x': embedding[:, 0],
                   'y': embedding[:, 1]}
        axes = {'x': 'x', 'y': 'y'}
        scatter = px.scatter
        if n_components == 3:
            columns['z'] = embedding[:, 2]
            axes['z'] = 'z'
            scatter = px.scatter_3d

        df = pd.DataFrame(columns)
        fig = scatter(df, color='cluster', title='%s of %s dataset' % (algo, label), **axes)
        fig.write_html(dirpath + os.sep + "%s_plotly_%s_clustered.html" % (algo, label))
        fig.show()
    return

plotly_express_embedding_clustered(fcontents)

**Compare lowdim kmeans labels with statistics in original space**

In [None]:
# Split the +/-1 states by embedding-space cluster: label 0 is 'blue' and
# label 1 is 'red' in label_to_colour. The other six clusters are not used here.
print(kmeans_lowdim.labels_)

blue_indices = [idx for idx, i in enumerate(kmeans_lowdim.labels_) if i == 0]
red_indices = [idx for idx, i in enumerate(kmeans_lowdim.labels_) if i == 1]
fcontents['data_blue'] = X_pm1[blue_indices, :] 
fcontents['data_red'] = X_pm1[red_indices, :]

print(fcontents['data_blue'].shape)
print(fcontents['data_red'].shape)

In [None]:
# Per-run magnetization (row sum / total_spins) within each of the two clusters.
# Relies on the notebook-level `total_spins` matching the column count of X_pm1.
magnetizations_blue = np.dot(fcontents['data_blue'], np.ones(total_spins)) / float(total_spins)
magnetizations_red = np.dot(fcontents['data_red'], np.ones(total_spins)) / float(total_spins)

print(magnetizations_red.shape)
print(magnetizations_blue.shape)

# overlaid histograms, one colour per cluster
plt.hist(magnetizations_blue, bins=150, alpha=0.4, color='blue')
plt.hist(magnetizations_red, bins=150, alpha=0.4, color='red')
plt.title('magnetizations')
#plt.ylim(0.,5)

In [None]:
# Raw per-run magnetizations of the blue cluster.
print(magnetizations_blue)

In [None]:
def get_plate_statistics(plate):
    """Return the un-normalized magnetization (sum of all spins) of one state.

    Args:
        plate: 1D array of spins.

    Returns:
        Floating-point sum of the entries, computed as a dot product with a
        vector of ones.
    """
    ones = np.ones(plate.shape[0])
    return np.dot(plate, ones)

# Print the summed magnetization of 50 consecutive red-cluster states,
# starting at row nn.
# NOTE(review): raises IndexError if the red cluster has fewer than nn + 50 rows.
nn = 700
for idx in range(nn, nn+50):
    plate = fcontents['data_red'][idx,:]
    #plate = plate[0:300]
    stats = get_plate_statistics(plate)
    print(stats)

**Search umap hyperparameters for single clusters**

Observe that between nn 2000 and nn 5000 (subsampling the num_runs = 10,000) the split to two clusters occurs

For nn 5000, tried 4 random state ints. 3/4 led to disks, 1 led to balls.  

Increasing spread from 1.0 to 2.0 causes the split to be present at nn 2000. 


On the other hand, reducing spread from 1.0 to 0.25 causes the clusters to merge for nn 5000.
Reducing n_neighbors from 15 to 5 (but not to 10) causes the same merge.
Increasing min_dist from 0.1 to 0.5 (but not to 0.2) causes the same merge.


MERGING:
- reduce spread
- reduce n_neighbors
- increase min_dist

In [None]:
# Baseline hyperparameters; edit these to run the searches described above.
default_metric = 'euclidean'
default_init = 'spectral'     # also try 'random'
default_n_neighbors = 15      # also try 2 to 100
default_min_dist = 0.1
default_spread = 1.0

# The baseline values are passed through, so the kwargs cannot drift from them.
UMAP_KWARGS = dict(
    random_state=0,
    n_components=3,
    metric=default_metric,
    init=default_init,
    n_neighbors=default_n_neighbors,
    min_dist=default_min_dist,
    spread=default_spread,
)

# number of runs (rows) fed to UMAP; lower it to subsample
nn = 10000
X = fcontents['data'][0:nn, :]

# shuffling order
#np.transpose(X)
#np.random.shuffle(X)
#np.transpose(X)

print(X.shape)
reducer_alt = umap.UMAP(**UMAP_KWARGS)
reducer_alt.fit(X)
embedding_alt = reducer_alt.transform(X)

In [None]:
import plotly.express as px

# Colour each point by its row index.
# NOTE(review): assumes embedding_alt has exactly nn rows; if the data holds
# fewer than nn runs the DataFrame columns will differ in length.
c = list(range(nn))

df = pd.DataFrame({'index': range(nn),
               'c': c,
               'x': embedding_alt[:,0],
               'y': embedding_alt[:,1],
               'z': embedding_alt[:,2]})

fig = px.scatter_3d(df, x='x', y='y', z='z', 
                    color='c',
                    title='UMAP of %d dataset' % nn,
                   )
# written next to the loaded results, under <fcontents['path']>/dimreduce
fig.write_html(fcontents['path'] + os.sep + 'dimreduce' + os.sep + "umap_plotly_gamma0_nn%d.html" % nn)
fig.show()

In [None]:
#subsample = 1000
#blue_ss = fcontents['data_blue'][0:subsample,:]
#red_ss = fcontents['data_red'][0:subsample,:]

blue_ss = fcontents['data_blue']
red_ss = fcontents['data_red']

# Normalize by the actual number of spins per state rather than a fixed 900,
# so the overlaps stay in [-1, 1] for +/-1 states of any size.
spins_per_state = blue_ss.shape[1]

# pairwise overlaps within each cluster and across the two clusters
X_blue = np.dot(blue_ss, blue_ss.T) / spins_per_state
X_red = np.dot(red_ss, red_ss.T) / spins_per_state
X_cross = np.dot(blue_ss, red_ss.T) / spins_per_state

In [None]:
# Shape and value range of each similarity matrix.
for sim_name, sim_matrix in (('Blue', X_blue), ('Red', X_red), ('Cross', X_cross)):
    print('%s: shape' % sim_name, sim_matrix.shape)
    print(sim_matrix.min(), sim_matrix.max())


In [None]:
#plt.figure()
bins_cross = 185
density = True

# The bin edges returned for the cross histogram are reused for the other
# two, so all three distributions share the same binning.
_, bins, _ = plt.hist(X_cross.flatten(), bins=bins_cross, alpha=0.4, color='k', label='cross', density=density)
plt.hist(X_blue.flatten(), bins=bins, alpha=0.4, color='blue', label='blue', density=density)
plt.hist(X_red.flatten(), bins=bins, alpha=0.4, color='red', label='red', density=density)
plt.legend()
plt.title('Similarity element distribution')
#plt.show()

In [None]:
# Blue-only histogram with its own binning; `density` comes from the previous
# cell. plt.ylim() without arguments only queries the limits and changes nothing.
n, bins2, _ = plt.hist(X_blue.flatten(), bins=160, alpha=0.4, color='blue', label='blue', density=density)
plt.ylim()
plt.show()
print(n)

**Statistics of each cell type in the two clusters**

In [None]:
# Reference single-cell states (9 spins each) shared by the spins_to_* functions.
_SPINS_PER_CELL = 9
_CELL_A = (1,1,1, -1,-1,-1, -1,-1,-1)
_CELL_B = (-1,-1,-1, 1,1,1, -1,-1,-1)
_CELL_C = (-1,-1,-1, -1,-1,-1, 1,1,1)
_CELL_S = (1,1,1, 1,1,1, 1,1,1)


def _with_mirrored_states(values_by_state):
    """Extend a {state: value} table so that each flipped state maps to -value."""
    lookup = dict(values_by_state)
    for state, value in values_by_state.items():
        lookup[tuple(-spin for spin in state)] = -value
    return lookup


def _map_cells(data_arr, lookup):
    """Look up every consecutive 9-spin cell of every row; returns (nruns, ncells) floats."""
    nruns, nspins = data_arr.shape
    ncells = nspins // _SPINS_PER_CELL   # trailing spins beyond a full cell are ignored
    mapped = np.zeros((nruns, ncells))
    for i in range(nruns):
        datarow = data_arr[i, :]
        mapped[i, :] = [lookup[tuple(datarow[a * _SPINS_PER_CELL:(a + 1) * _SPINS_PER_CELL])]
                        for a in range(ncells)]
    return mapped


def spins_to_types(data_arr):
    """Encode each 9-spin cell as a signed cell-type code.

    Codes: A -> 1, B -> 2, C -> 3, S -> 4, and the flipped states -> -1..-4.

    Args:
        data_arr: 2D array of +/-1 spins, shape (nruns, nspins).

    Returns:
        Float array of shape (nruns, nspins // 9) holding the codes.

    Raises:
        KeyError: if a cell is not one of the eight reference states.
    """
    lookup = _with_mirrored_states({_CELL_A: 1, _CELL_B: 2, _CELL_C: 3, _CELL_S: 4})
    return _map_cells(data_arr, lookup)


def spins_to_mag(data_arr):
    """Encode each 9-spin cell by its magnetization (mean spin of the cell).

    Values: A, B, C -> -1/3, S -> 1, and the flipped states -> +1/3 and -1.

    Args:
        data_arr: 2D array of +/-1 spins, shape (nruns, nspins).

    Returns:
        Float array of shape (nruns, nspins // 9).

    Raises:
        KeyError: if a cell is not one of the eight reference states.
    """
    q = float(1/3)
    lookup = _with_mirrored_states({_CELL_A: -q, _CELL_B: -q, _CELL_C: -q, _CELL_S: 1})
    return _map_cells(data_arr, lookup)


def spins_to_magsum(data_arr):
    """Sum the per-cell magnetizations of spins_to_mag over the cells of each run.

    Args:
        data_arr: 2D array of +/-1 spins, shape (nruns, nspins).

    Returns:
        Float array of shape (nruns,).

    Raises:
        KeyError: if a cell is not one of the eight reference states.
    """
    cell_mags = spins_to_mag(data_arr)
    magsum = np.zeros(cell_mags.shape[0])
    for i in range(cell_mags.shape[0]):
        magsum[i] = np.sum(cell_mags[i, :])
    return magsum

In [None]:
# Per-cell type codes, per-cell magnetizations and per-run magnetization sums
# for the blue and red clusters.
celltypes_blue = spins_to_types(fcontents['data_blue'])
celltypes_red = spins_to_types(fcontents['data_red'])
mags_blue = spins_to_mag(fcontents['data_blue'])
mags_red = spins_to_mag(fcontents['data_red'])
mags_blue_sum = spins_to_magsum(fcontents['data_blue'])
mags_red_sum = spins_to_magsum(fcontents['data_red'])

In [None]:
# Half-integer bin edges -4.5 .. 4.5, so each integer type code -4..4 gets its own bin.
bins_explicit = [a+0.5 for a in range(-5,5)]
print(bins_explicit)

plt.hist(celltypes_blue.flatten(), bins=bins_explicit)

In [None]:
# Same type-code histogram for the red cluster.
plt.hist(celltypes_red.flatten(), bins=bins_explicit)

In [None]:
# Hand-picked bin edges that separate the four values spins_to_mag can
# produce (-1, -1/3, +1/3, +1).
bins_explicit_mag = [-1.25,-0.75,-0.6,-0.1,0.1,0.6,0.75,1.25]
print(bins_explicit_mag)
plt.hist(mags_blue.flatten(), bins=bins_explicit_mag)

In [None]:
# Same per-cell magnetization histogram for the red cluster.
plt.hist(mags_red.flatten(), bins=bins_explicit_mag)

In [None]:
#bins_explicit_mag = [-1.25,-0.75,-0.6,-0.1,0.1,0.6,0.75,1.25]
#print(bins_explicit_mag)
# Distribution of the per-run magnetization sums in the blue cluster.
plt.hist(mags_blue_sum, bins=10)

In [None]:
#bins_explicit_mag = [-1.25,-0.75,-0.6,-0.1,0.1,0.6,0.75,1.25]
#print(bins_explicit_mag)
# Distribution of the per-run magnetization sums in the red cluster.
plt.hist(mags_red_sum, bins=10)

In [None]:
# Type codes for every run, not just the blue/red clusters.
celltypes_all = spins_to_types(X_pm1)

In [None]:
# Type-code histogram over all runs.
plt.hist(celltypes_all.flatten(), bins=bins_explicit)

In [None]:
#for idx in range(10):
    #plt.hist(celltypes_red[idx,:], bins=bins_explicit, alpha=0.1)
# np.bincount rejects negative inputs, and the type codes produced by
# spins_to_types include -4..-1, so count occurrences with np.unique.
red_type_codes, red_type_counts = np.unique(celltypes_red.flatten().astype(int), return_counts=True)
np.stack([red_type_codes, red_type_counts]).T   # rows of (type code, count)

**Explicit celltype counts by run**

In [None]:
# Rows of (type code, count) over all cells of the blue cluster.
# Note: this rebinds the notebook-level name `c`.
u, c = np.unique(celltypes_blue.flatten().astype(int), return_counts=True)
print(np.stack([u, c]).T)

In [None]:
def gen_count_arr(celltypes_arr):
    """Count how often each of the eight cell-type codes occurs in every run.

    Args:
        celltypes_arr: 2D array (runs x cells) of type codes taken from
            {-4, -3, -2, -1, 1, 2, 3, 4}; values are cast to int.

    Returns:
        Integer array of shape (8, nruns). Rows follow the code order
        -4, -3, -2, -1, 1, 2, 3, 4; entry [r, i] is the count of that code
        in run i.

    Side effects: prints the mean count of each code over the runs.

    Raises:
        KeyError: if a run contains a code outside the eight listed above.
    """
    type_codes = (-4, -3, -2, -1, 1, 2, 3, 4)
    row_of_code = {code: row for row, code in enumerate(type_codes)}

    nruns = celltypes_arr.shape[0]
    count_arr = np.zeros((len(type_codes), nruns), dtype=int)

    for run in range(nruns):
        codes, counts = np.unique(celltypes_arr[run, :].astype(int), return_counts=True)
        for code, count in zip(codes, counts):
            count_arr[row_of_code[code], run] = count

    print(np.mean(count_arr, axis=1))
    return count_arr

# Per-run type counts for the red cluster, the blue cluster and all runs;
# each call also prints the mean count per type code.
red_count_arr = gen_count_arr(celltypes_red)
blue_count_arr = gen_count_arr(celltypes_blue)
all_count_arr = gen_count_arr(celltypes_all)

In [None]:
# Rows 0 and 7 of the count array hold type codes -4 and +4 (-S and +S).
# Unit-width bins centred on the integer counts 0..100.
plt.hist(red_count_arr[0,:], bins=np.arange(-0.5,101.5), color='purple', label='-S', alpha=0.5)
plt.hist(red_count_arr[7,:], bins=np.arange(-0.5,101.5), color='green', label='+S', alpha=0.5)
#plt.hist(red_count_arr[3,:], bins=np.arange(-0.5,101.5), color='orange', label='+A', alpha=0.5)
#plt.hist(red_count_arr[2,:], bins=np.arange(-0.5,101.5), color='pink', label='-A', alpha=0.5)
plt.legend()
plt.title('For RED: freq of -S vs +S cell types')
plt.show()

In [None]:
# BLUE: per-sample count of label -4 (row 0, '-S') vs label +4 (row 7, '+S').
# Bin edges -0.5, 0.5, ..., 100.5 give unit-width bins centred on the integer counts 0..100.
plt.hist(blue_count_arr[0,:], bins=np.arange(-0.5,101.5), color='purple', label='-S', alpha=0.5)
plt.hist(blue_count_arr[7,:], bins=np.arange(-0.5,101.5), color='green', label='+S', alpha=0.5)
plt.legend()
plt.title('For BLUE: freq of -S vs +S cell types')
plt.show()

In [None]:
# RED: per-sample count of label -1 (row 3, '-A') vs label +1 (row 4, '+A').
# Bin edges -0.5, 0.5, ..., 100.5 give unit-width bins centred on the integer counts 0..100.
plt.hist(red_count_arr[3,:], bins=np.arange(-0.5,101.5), color='purple', label='-A', alpha=0.5)
plt.hist(red_count_arr[4,:], bins=np.arange(-0.5,101.5), color='green', label='+A', alpha=0.5)
plt.legend()
plt.title('For RED: freq of -A vs +A cell types')
plt.show()

In [None]:
# BLUE: per-sample count of label -1 (row 3, '-A') vs label +1 (row 4, '+A').
# Bin edges -0.5, 0.5, ..., 100.5 give unit-width bins centred on the integer counts 0..100.
plt.hist(blue_count_arr[3,:], bins=np.arange(-0.5,101.5), color='purple', label='-A', alpha=0.5)
plt.hist(blue_count_arr[4,:], bins=np.arange(-0.5,101.5), color='green', label='+A', alpha=0.5)
plt.legend()
plt.title('For BLUE: freq of -A vs +A cell types')
plt.show()

In [None]:
# BLUE: per-sample count of label -2 (row 2, '-B') vs label +2 (row 5, '+B').
# Bin edges -0.5, 0.5, ..., 100.5 give unit-width bins centred on the integer counts 0..100.
plt.hist(blue_count_arr[2,:], bins=np.arange(-0.5,101.5), color='purple', label='-B', alpha=0.5)
plt.hist(blue_count_arr[5,:], bins=np.arange(-0.5,101.5), color='green', label='+B', alpha=0.5)
plt.legend()
plt.title('For BLUE: freq of -B vs +B cell types')
plt.show()

In [None]:
# BLUE: per-sample count of label -3 (row 1, '-C') vs label +3 (row 6, '+C').
# Bin edges -0.5, 0.5, ..., 100.5 give unit-width bins centred on the integer counts 0..100.
plt.hist(blue_count_arr[1,:], bins=np.arange(-0.5,101.5), color='purple', label='-C', alpha=0.5)
plt.hist(blue_count_arr[6,:], bins=np.arange(-0.5,101.5), color='green', label='+C', alpha=0.5)
plt.legend()
plt.title('For BLUE: freq of -C vs +C cell types')
plt.show()

In [None]:
# RED: per-sample difference (count of -S, row 0) minus (count of +S, row 7).
plt.hist(red_count_arr[0,:] - red_count_arr[7,:], bins=59, color='grey', alpha=0.7)
# Thick vertical line at zero marks perfectly balanced -S / +S counts.
plt.axvline(0, linewidth=4)
# No legend: none of the artists carries a label, so plt.legend() would only emit a warning.
plt.title('For RED: freq of difference "-S minus +S" cell types')
plt.show()

In [None]:
# BLUE: per-sample difference (count of -S, row 0) minus (count of +S, row 7).
plt.hist(blue_count_arr[0,:] - blue_count_arr[7,:], bins=59, color='grey', alpha=0.7)
# Thick vertical line at zero marks perfectly balanced -S / +S counts.
plt.axvline(0, linewidth=4)
# No legend: none of the artists carries a label, so plt.legend() would only emit a warning.
plt.title('For BLUE: freq of difference "-S minus +S" cell types')
plt.show()

In [None]:
# ALL samples: per-sample difference (count of -S, row 0) minus (count of +S, row 7).
plt.hist(all_count_arr[0,:] - all_count_arr[7,:], bins=46, color='grey', alpha=0.7)
# Thick vertical line at zero marks perfectly balanced -S / +S counts.
plt.axvline(0, linewidth=4)
# No legend: none of the artists carries a label, so plt.legend() would only emit a warning.
# Title names the population actually plotted (all_count_arr), not BLUE.
plt.title('For ALL: freq of difference "-S minus +S" cell types')
plt.show()

**Testing UNIQUE flag for umap**

In [None]:
# Build (or load from cache) dimension-reduction objects for the selected 'manyruns'
# directories, once for 2 and once for 3 embedding components, with UMAP's 'unique' flag on.
#
# NOTE(review): PCA_KWARGS, UMAP_KWARGS, TSNE_KWARGS, REDUCERS_TO_USE, joblib,
# make_dimreduce_object, save_dimreduce_object and generate_control_data are not defined
# in the cells visible to this review; presumably they all come from this star import — confirm.
from unsupervised_helper import *

# Feature switches; the body below runs if any one of them is truthy.
build_dimreduce_dicts = True
add_control_data = False
vis_all = True
pca_assess = False
nsubsample = None  # None or an int

# Step 0) which 'manyruns' dirs to work with
#gamma_list = [0.0, 0.05, 0.1, 0.2, 1.0, 2.0, 20.0]
gamma_list = [20.0]
#gamma_list = [20.0]
#manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_ferro' % a for a in gamma_list]
#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_p3_M100' % a for a in gamma_list]
manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]

manyruns_paths = [RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
                  for dirname in manyruns_dirnames]

# Step 1) umap (or other dim reduction) kwargs
if any([build_dimreduce_dicts, add_control_data, vis_all, pca_assess]):
    for n_components in [2, 3]:
        #n_components = 3
        # Work on copies so the shared default dicts are not mutated between iterations.
        pca_kwargs = PCA_KWARGS.copy()
        pca_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        umap_kwargs = UMAP_KWARGS.copy()
        umap_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        tsne_kwargs = TSNE_KWARGS.copy()
        tsne_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        # modify pca settings
        # modify umap settings
        umap_kwargs['unique'] = True
        #umap_kwargs['n_neighbors'] = 100
        #umap_kwargs['min_dist'] = 0.1
        #umap_kwargs['spread'] = 1.0
        #umap_kwargs['metric'] = 'euclidean'
        # modify tsne settings
        tsne_kwargs['perplexity'] = 100

        # Modify filename suffix for dimreduce pkl and plots
        # The suffix encodes the reducers used, the dimension and seed, and every
        # setting that differs from the literal default values compared against below,
        # so differently-configured results get different cache files.
        fmod = '_F=' + '+'.join(REDUCERS_TO_USE)
        fmod += '_dim%d_seed%d' % (umap_kwargs['n_components'], umap_kwargs['random_state'])
        if nsubsample is not None:
            fmod += '_nn%d' % nsubsample
        if 'umap' in REDUCERS_TO_USE:
            if umap_kwargs['metric'] != 'euclidean':
                fmod += '_%s' % umap_kwargs['metric']
            if umap_kwargs['init'] != 'spectral':
                fmod += '_%s' % umap_kwargs['init']
            if umap_kwargs['n_neighbors'] != 15:
                fmod += '_nbor%d' % umap_kwargs['n_neighbors']
            if umap_kwargs['min_dist'] != 0.1:
                fmod += '_dist%.2f' % umap_kwargs['min_dist']
            if umap_kwargs['spread'] != 1.0:
                fmod += '_spread%.2f' % umap_kwargs['spread']
            if umap_kwargs['unique']:
                fmod += '_unique'
        if 'tsne' in REDUCERS_TO_USE:
            if tsne_kwargs['perplexity'] != 30.0:
                fmod += '_perplex%.2f' % tsne_kwargs['perplexity']

        # Step 2) make/load data
        # Start from a minimal descriptor (label + path) per directory; each entry is
        # replaced below by the loaded or freshly built dimreduce object.
        datasets = {i: {'label': manyruns_dirnames[i],
                        'path': manyruns_paths[i]}
                    for i in range(len(manyruns_dirnames))}

        for idx in range(len(manyruns_dirnames)):
            fpath = manyruns_paths[idx] + os.sep + 'dimreduce' + os.sep + 'dimreduce%s.z' % fmod
            if os.path.isfile(fpath):
                print('Exists already, loading: %s' % fpath)
                fcontents = joblib.load(fpath)  # just load file if it exists
                datasets[idx] = fcontents
            else:
                print('Dim. reduction on manyruns: %s' % manyruns_dirnames[idx])
                datasets[idx] = make_dimreduce_object(
                    datasets[idx], nsubsample=nsubsample, flag_control=False,
                    umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs)
                save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

        if add_control_data:
            print('adding control data...')
            # Size the control set like dataset 0.
            total_spins_0 = datasets[0]['total_spins']
            num_runs_0 = datasets[0]['num_runs']

            # add control data into the dict of datasets
            control_X = generate_control_data(total_spins_0, num_runs_0)
            control_folder = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + 'control'
            control_fpath = control_folder + os.sep + \
                            'dimreduce' + os.sep + 'dimreduce%s.z' % fmod

            # The control set is stored under key -1, separate from the real datasets (0..n-1).
            # NOTE(review): energies are filled with zeros of width 5; presumably 5 is the
            # number of energy columns stored for real runs — confirm.
            datasets[-1] = {
                'data': control_X,
                'label': 'control (coin-flips)',
                'num_runs': num_runs_0,
                'total_spins': total_spins_0,
                'energies': np.zeros((num_runs_0, 5)),
                'path': control_folder
            }
            # Unlike the real datasets, the control object is rebuilt and re-saved every time (no cache check).
            datasets[-1] = make_dimreduce_object(
                datasets[-1], flag_control=True, nsubsample=nsubsample,
                umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs)
            save_dimreduce_object(datasets[-1], control_fpath)  # save to file (joblib)


**Testing plotly surface plot**

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Read data from a csv (downloaded over the network from the plotly datasets repo)
z_data = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/api_docs/mt_bruno_elevation.csv')

# Quick look at the frame and at the underlying ndarray.
#print(z_data['x'])
data_top = z_data.head()  
print(data_top)
print(z_data.shape)
print(z_data.values.shape)
print(type(z_data.values))
a = z_data.values
# Flatten to a single row. len(a) is only the number of rows, so reshape(1, len(a))
# fails for any 2D array with more than one column; -1 lets numpy infer the length.
a.reshape(1, -1)

In [None]:
# Render the elevation grid as an interactive 3D surface (z values taken straight from the frame).
fig = go.Figure(data=[go.Surface(z=z_data.values)])

# Fixed 500x500 canvas; autosize is disabled so the explicit width/height are used.
fig.update_layout(title='Mt Bruno Elevation', autosize=False,
                  width=500, height=500,
                  margin=dict(l=65, r=50, b=65, t=90))

fig.show()

**Aligned umap testing**

In [None]:
import numpy as np
import sklearn.datasets
import umap
#import umap.plot
import umap.utils as utils
import umap.aligned_umap
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

In [None]:
# scikit-learn digits dataset; .data (features) and .target (labels) are used in the cells below.
digits = sklearn.datasets.load_digits()

In [None]:
# Identity relation: sample i of one dataset corresponds to sample i of the next one.
constant_dict = {i:i for i in range(digits.data.shape[0])}
# One relation per consecutive pair of the 10 datasets fitted below (10 - 1 = 9).
constant_relations = [constant_dict for i in range(9)]

# Fit the same data 10 times with per-dataset n_neighbors / min_dist values, keeping the
# embeddings aligned through the identity relations.
# NOTE(review): n_neighbors lists 11 values and min_dist 10, while 10 datasets are fitted;
# presumably the surplus n_neighbors entry is simply unused — confirm against the AlignedUMAP docs.
neighbors_mapper = umap.AlignedUMAP(
    n_neighbors=[3,4,5,7,11,16,22,29,37,45,54],
    min_dist=[0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45],
    alignment_window_size=2,
    alignment_regularisation=1e-3,
).fit(
    [digits.data for i in range(10)], relations=constant_relations
)

In [None]:
# Stack all aligned 2D embeddings into one long-format frame:
#   x, y  - embedding coordinates
#   z     - position of the embedding in the sequence, linearly rescaled to [0, 1]
#   id    - sample index within an embedding
#   digit - class label of the sample
n_embeddings = len(neighbors_mapper.embeddings_)
es = neighbors_mapper.embeddings_
embedding_df = pd.DataFrame(np.vstack(es), columns=('x', 'y'))
embedding_df['z'] = np.repeat(np.linspace(0, 1.0, n_embeddings), es[0].shape[0])
embedding_df['id'] = np.tile(np.arange(es[0].shape[0]), n_embeddings)
embedding_df['digit'] = np.tile(digits.target, n_embeddings)

In [None]:
import scipy.interpolate

# Cubic interpolators of each sample's x and y coordinate as a function of z.
# The z grid is read from the rows of sample id 0 (one row per embedding); the values are
# reshaped to (n_samples, n_embeddings) so each row is one sample's path through the embeddings.
fx = scipy.interpolate.interp1d(
    embedding_df.z[embedding_df.id == 0], embedding_df.x.values.reshape(n_embeddings, digits.data.shape[0]).T, kind="cubic"
)
fy = scipy.interpolate.interp1d(
    embedding_df.z[embedding_df.id == 0], embedding_df.y.values.reshape(n_embeddings, digits.data.shape[0]).T, kind="cubic"
)
# Dense z grid (100 points) on which the paths are evaluated for plotting.
z = np.linspace(0, 1.0, 100)

In [None]:
# One 3D line per sample: interpolated (x, y) path against the stretched z axis (z * 3),
# coloured by the sample's digit label via the Spectral palette.
palette = px.colors.diverging.Spectral
interpolated_traces = [fx(z), fy(z)]
traces = [
    go.Scatter3d(
        x=interpolated_traces[0][i],
        y=interpolated_traces[1][i],
        z=z*3.0,
        mode="lines",
        line=dict(
            color=palette[digits.target[i]],
            width=3.0
        ),
        opacity=1.0,
    )
    for i in range(digits.data.shape[0])
]
fig = go.Figure(data=traces)
# Fixed-size canvas; the legend is hidden because there is one trace per sample.
fig.update_layout(
    width=800,
    height=700,
    autosize=False,
    showlegend=False,
)
fig.show()

In [None]:
# Save the interactive figure as a standalone HTML file in the current working directory.
fig.write_html('zzz.html')


In [None]:
# alternate way, top of
# https://umap-learn.readthedocs.io/en/latest/aligned_umap_basic_usage.html

# Sort samples (and their labels, with the same permutation) by the sum of their feature values.
ordered_digits = digits.data[np.argsort(digits.data.sum(axis=1))]
ordered_target = digits.target[np.argsort(digits.data.sum(axis=1))]
# Show the sample with the largest sum as an 8x8 image.
plt.matshow(ordered_digits[-1].reshape((8,8)))

In [None]:
# 10 overlapping windows over the ordered data: window i starts at row 150*i and holds up to 400 rows.
slices = [ordered_digits[150 * i:min(ordered_digits.shape[0], 150 * i + 400)] for i in range(10)]
# Consecutive windows overlap by 250 rows: row i+150 of one window is row i of the next.
relation_dict = {i+150:i for i in range(400-150)}
# One relation per consecutive pair of windows.
relation_dicts = [relation_dict.copy() for i in range(len(slices) - 1)]

In [None]:
# Sanity check: number of windows built above.
print(len(slices))

In [None]:
# Sanity check: (rows, features) of the first window.
print(slices[0].shape)

In [None]:
# Fit AlignedUMAP with default settings on the overlapping windows, linked by relation_dicts.
aligned_mapper = umap.AlignedUMAP().fit(slices, relations=relation_dicts)

In [None]:
def axis_bounds(embedding):
    """Return padded plot limits for a 2D embedding.

    Args:
        embedding: array whose first two columns are the x and y coordinates.

    Returns:
        ``[xmin, xmax, ymin, ymax]`` where each side is pushed outward by 10%
        of the data extent along that axis.
    """
    xs, ys = embedding.T[0], embedding.T[1]
    x_lo, x_hi = xs.min(), xs.max()
    y_lo, y_hi = ys.min(), ys.max()
    # Margin on each side: one tenth of the extent along that axis.
    pad_x = (x_hi - x_lo) * 0.1
    pad_y = (y_hi - y_lo) * 0.1
    return [x_lo - pad_x, x_hi + pad_x, y_lo - pad_y, y_hi + pad_y]

In [None]:
# 5x2 grid, one panel per window/embedding (10 in total), all sharing the same padded limits.
fig, axs = plt.subplots(5,2, figsize=(10, 20))
ax_bound = axis_bounds(np.vstack(aligned_mapper.embeddings_))
for i, ax in enumerate(axs.flatten()):
    # Labels for window i, sliced with the same bounds used to build the windows.
    current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
    ax.scatter(*aligned_mapper.embeddings_[i].T, s=2, c=current_target, cmap="Spectral")
    ax.axis(ax_bound)
    ax.set(xticks=[], yticks=[])
plt.tight_layout()

**Morphing the above example to our celltype data**

In [None]:
# Reducer configuration for this section.
REDUCER_SEED = 100
REDUCER_COMPONENTS = 2
UMAP_KWARGS = {
    'random_state': REDUCER_SEED,
    'n_components': REDUCER_COMPONENTS,
    'metric': 'euclidean',
    'init': 'spectral',
    'unique': False,
    'n_neighbors': 15,
    'min_dist': 0.1,
    'spread': 1.0,
}

# If True, the loaded states are remapped with (1 + X) / 2 and cast to int (+/-1 -> 1/0).
use_01 = True

In [None]:
    # Load the aggregated states of one 'manyruns' directory at several steps and stack
    # them into X_multi, one slice per step.
    # Step 0) which 'manyruns' dirs to work with
    # gamma_list = [0.0, 0.05, 0.1, 0.2, 1.0, 2.0, 20.0]
    gamma_list = [0.2]
    # step_list = [0.0, 10.0]  # list of [None] or list of steps
    step_list = [0, 5, 10,15,20,25,29]

    # gamma_list = [2.0, 20.0]
    # manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_ferro' % a for a in gamma_list]
    # manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_p3_M100' % a for a in gamma_list]
    # manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]
    # manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]

    manyruns_dirnames = ['beta2.00_Wrandom0_gamma%.2f_1k_p3_M100' % a for a in
                         gamma_list]

    manyruns_paths = [RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
                      for dirname in manyruns_dirnames]

    # Step 1) umap (or other dim reduction) kwargs
    n_components = 2
    nn = 1000  # number of rows kept from each loaded state array
    # One (nn x 900) slice per step; float dtype (np.zeros default).
    # NOTE(review): 900 is hard-coded; presumably it is the number of columns of X — confirm.
    X_multi = np.zeros((len(step_list), nn, 900))
    for j, step in enumerate(step_list):
        # NOTE(review): umap_kwargs is prepared here but not used again in this cell.
        umap_kwargs = UMAP_KWARGS.copy()
        umap_kwargs[
            'n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
        # modify umap settings
        # umap_kwargs['unique'] = True
        # umap_kwargs['n_neighbors'] = 100
        # umap_kwargs['min_dist'] = 0.
        # umap_kwargs['spread'] = 1.0
        # umap_kwargs['metric'] = 'euclidean'

        # Step 2) make/load data
        # ...
        
        # File-name suffix selecting the aggregate saved at this step.
        smod = '_%d' % step
        
        manyruns_path = manyruns_paths[0]  # TODO
        agg_dir = manyruns_path + os.sep + 'aggregate'
        fpath_state = agg_dir + os.sep + 'X_aggregate%s.npz' % smod
        fpath_energy = agg_dir + os.sep + 'X_energy%s.npz' % smod
        fpath_pickle = manyruns_path + os.sep + 'multicell_template.pkl'
        print(fpath_state)
        
        X = np.load(fpath_state)['arr_0'].T  # umap wants transpose
        X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
        # The template path does not depend on the step, so the same pickle is re-read every iteration.
        with open(fpath_pickle, 'rb') as pickle_file:
            multicell_template = pickle.load(pickle_file)  # unpickling multicell object
        
        if use_01:
            X = (1 + X) / 2.0
            X = X.astype(int)
        
        # Keep only the first nn rows of this step.
        X_multi[j,:,:] = X[0:nn, :]

In [None]:
# AlignedUMAP needs one relation dict per consecutive pair of datasets ('time varying' data).
# A relation maps sample (row) indices of dataset j to the matching row indices of dataset j+1.
# Each trajectory maps to itself across steps, so the relation is the identity over all rows
# of a slice (X_multi.shape[1]) -- not over the 900 columns, which would leave any rows
# beyond index 899 unaligned.

constant_dict = {i: i for i in range(X_multi.shape[1])}
constant_relations = [constant_dict for i in range(len(step_list)-1)]

In [None]:
# Fit AlignedUMAP on the stacked array; its first axis indexes the datasets (one per step).
# NOTE(review): AlignedUMAP() is built with default arguments here — the UMAP_KWARGS /
# umap_kwargs prepared earlier (including the random seed) are not passed. Confirm intent.
#X_multi_as_list = [X_multi[i,:,:] for i in range(X_multi.shape[0])]
aligned_mapper = umap.AlignedUMAP().fit(X_multi, relations=constant_relations)

In [None]:
def axis_bounds(embedding):
    """Return padded plot limits for a 2D embedding.

    Args:
        embedding: array whose first two columns are the x and y coordinates.

    Returns:
        ``[xmin, xmax, ymin, ymax]`` where each side is pushed outward by 10%
        of the data extent along that axis.
    """
    xs, ys = embedding.T[0], embedding.T[1]
    x_lo, x_hi = xs.min(), xs.max()
    y_lo, y_hi = ys.min(), ys.max()
    # Margin on each side: one tenth of the extent along that axis.
    pad_x = (x_hi - x_lo) * 0.1
    pad_y = (y_hi - y_lo) * 0.1
    return [x_lo - pad_x, x_hi + pad_x, y_lo - pad_y, y_hi + pad_y]

In [None]:
# One panel per aligned embedding, all sharing the same padded limits.
fig, axs = plt.subplots(5,2, figsize=(10, 20))
ax_bound = axis_bounds(np.vstack(aligned_mapper.embeddings_))
for i, ax in enumerate(axs.flatten()):
    # The 5x2 grid has 10 panels but there is one embedding per fitted dataset (fewer than 10
    # for the current step_list); blank the surplus panels instead of indexing past the end.
    if i >= len(aligned_mapper.embeddings_):
        ax.axis('off')
        continue
    print(i)
    # No per-point colour values are supplied, so no cmap is passed (matplotlib would ignore it
    # and warn).
    ax.scatter(*aligned_mapper.embeddings_[i].T, s=2)
    ax.axis(ax_bound)
    ax.set(xticks=[], yticks=[])
plt.tight_layout()

**Align UMAP across gamma values (instead of across steps)**

In [None]:
# Reducer configuration for the gamma-aligned section.
REDUCER_SEED = 100
REDUCER_COMPONENTS = 2
UMAP_KWARGS = {
    'random_state': REDUCER_SEED,
    'n_components': REDUCER_COMPONENTS,
    'metric': 'euclidean',
    'init': 'spectral',
    'unique': False,
    'n_neighbors': 15,
    'min_dist': 0.1,
    'spread': 1.0,
}

# If True, the loaded states are remapped with (1 + X) / 2 and cast to int (+/-1 -> 1/0).
use_01 = True
# nn: number of rows kept per dataset; kk: number of columns kept per row.
nn = 4000  # runs with 1000, crash with 5000, 10000 -- try to restrict to more int gammas maybe
kk = 900   # debug: subsample multicell spins to avoid memory issue

In [None]:
# Load the final ('_last') aggregated states of one 'manyruns' directory per gamma value
# and stack them into X_multi, one slice per gamma.
# Step 0) which 'manyruns' dirs to work with
# gamma_list = [0.0, 0.05, 0.1, 0.2, 1.0, 2.0, 20.0]

#gamma_list = [0.0, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.15, 0.20, 0.4, 0.6, 0.8, 0.9, 1.0, 20.0]

#gamma_list = [0.0, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.15, 0.20, 0.4, 0.6, 0.8, 0.9, 1.0, 20.0]
#gamma_list = [0.0, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.15, 0.20, 0.4, 0.6, 0.8, 1.0, 20.0]
gamma_list = [0.0, 0.02, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.15, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 20.0]
#gamma_list = gamma_list[::-1]

#gamma_list = [0.0, 0.05]  # , 0.06]# , 0.07] # , 0.08, 0.09, 0.10] #, 0.15, 0.20, 0.4, 0.6, 0.8, 0.9, 1.0, 20.0]

# gamma_list = [2.0, 20.0]
# manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_ferro' % a for a in gamma_list]
# manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_p3_M100' % a for a in gamma_list]
# manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]
# manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]

#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_periodic_fixedorderV3_p3_M100' % a for a in
#                     gamma_list]
manyruns_dirnames = ['Wmaze15_gamma%.2f_10k_p3_M100' % a for a in
                     gamma_list]

manyruns_paths = [RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
                  for dirname in manyruns_dirnames]

# Step 1) umap (or other dim reduction) kwargs
n_components = 2

# One (nn x kk) integer slice per gamma value.
X_multi = np.zeros((len(gamma_list), nn, kk), dtype=int)
for j, manyruns_path in enumerate(manyruns_paths):

    gamma_val = gamma_list[j]

    # NOTE(review): umap_kwargs is prepared here but not used again in this cell.
    umap_kwargs = UMAP_KWARGS.copy()
    umap_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
    # modify umap settings
    # umap_kwargs['unique'] = True
    # umap_kwargs['n_neighbors'] = 100
    umap_kwargs['min_dist'] = 0.25
    # umap_kwargs['spread'] = 1.0
    # umap_kwargs['metric'] = 'euclidean'

    # Step 2) make/load data
    # ...

    # File-name suffix selecting the aggregate saved at the last step.
    smod = '_last'

    agg_dir = manyruns_path + os.sep + 'aggregate'
    fpath_state = agg_dir + os.sep + 'X_aggregate%s.npz' % smod
    fpath_energy = agg_dir + os.sep + 'X_energy%s.npz' % smod
    fpath_pickle = manyruns_path + os.sep + 'multicell_template.pkl'

    X = np.load(fpath_state)['arr_0'].T  # umap wants transpose
    X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
    with open(fpath_pickle, 'rb') as pickle_file:
        multicell_template = pickle.load(pickle_file)  # unpickling multicell object

    if use_01:
        X = (1 + X) / 2.0
        X = X.astype(int)

    print('accessing', j, manyruns_path)
    # Keep the first nn rows and first kk columns; X must be at least that large.
    X_multi[j, :, :] = X[0:nn, 0:kk]

In [None]:
# AlignedUMAP needs one relation dict per consecutive pair of datasets (here: consecutive gammas).
# A relation maps sample (row) indices of dataset j to the matching row indices of dataset j+1.
# Each row maps to itself, so the relation is the identity over all rows of a slice
# (X_multi.shape[1], i.e. nn) -- not over the kk columns, which would leave every row
# with index >= kk unaligned.

constant_dict = {i: i for i in range(X_multi.shape[1])}
constant_relations = [constant_dict for i in range(len(gamma_list)-1)]

In [None]:
# Fit AlignedUMAP on the stacked array; its first axis indexes the datasets (one per gamma).
# NOTE(review): AlignedUMAP() is built with default arguments here — the UMAP_KWARGS /
# umap_kwargs prepared earlier (including min_dist=0.25 and the random seed) are not passed.
# Confirm intent.
#X_multi_as_list = [X_multi[i,:,:] for i in range(X_multi.shape[0])]
aligned_mapper = umap.AlignedUMAP().fit(X_multi, relations=constant_relations)

In [None]:
def axis_bounds(embedding):
    """Return padded plot limits for a 2D embedding.

    Args:
        embedding: array whose first two columns are the x and y coordinates.

    Returns:
        ``[xmin, xmax, ymin, ymax]`` where each side is pushed outward by 10%
        of the data extent along that axis.
    """
    xs, ys = embedding.T[0], embedding.T[1]
    x_lo, x_hi = xs.min(), xs.max()
    y_lo, y_hi = ys.min(), ys.max()
    # Margin on each side: one tenth of the extent along that axis.
    pad_x = (x_hi - x_lo) * 0.1
    pad_y = (y_hi - y_lo) * 0.1
    return [x_lo - pad_x, x_hi + pad_x, y_lo - pad_y, y_hi + pad_y]

In [None]:
# Grid of aligned embeddings, one panel per gamma value, points coloured by energy.
num_rows = 4
num_cols = 4
fig, axs = plt.subplots(num_rows, num_cols, figsize=(20, 40))
ax_bound = axis_bounds(np.vstack(aligned_mapper.embeddings_))
for i, ax in enumerate(axs.flatten()):
    # Panels beyond the number of gammas are left untouched (empty axes).
    if i<len(gamma_list):
        #current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
        
        manyruns_path = manyruns_paths[i]
        print(i, gamma_list[i], manyruns_path)
        # Re-load the energies of this gamma's runs from disk.
        fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
        X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
        # First energy column, restricted to the nn rows that were embedded.
        energies = X_energies[:nn, 0]
        
        #ax.scatter(*aligned_mapper.embeddings_[i].T, s=2, cmap="Spectral_r")
        ax.scatter(*aligned_mapper.embeddings_[i].T, s=10, c=energies, cmap="Spectral_r")
        
        # Same limits on every panel so positions are comparable across gammas.
        ax.axis(ax_bound)
        ax.set(xticks=[], yticks=[])
plt.tight_layout()
# Written to the current working directory; file name encodes nn and the number of gammas.
plt.savefig('aligned%d_gammas%d.jpg' % (nn, len(gamma_list)), dpi=300)

In [None]:
# Single-row figure for a hand-picked subset of gamma indices (indices into gamma_list).
# The first assignment is immediately overridden by the second; only [0, 1] is plotted.
picks = [14, 11, 10, 9, 7, 6, 5, 0]
picks = [0, 1]

num_rows = 1
num_cols = len(picks)

# NOTE(review): axs.flatten() below requires axs to be an array, i.e. more than one pick.
fig, axs = plt.subplots(num_rows, num_cols, figsize=(30, 10))
ax_bound = axis_bounds(np.vstack(aligned_mapper.embeddings_))
for j, ax in enumerate(axs.flatten()):
    if j<len(picks):
        
        i = picks[j]
        #current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
        
        manyruns_path = manyruns_paths[i]
        print(i, gamma_list[i], manyruns_path)
        fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
        X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
        # First energy column, restricted to the nn rows that were embedded.
        energies = X_energies[:nn, 0]
        
        #ax.scatter(*aligned_mapper.embeddings_[i].T, s=2, cmap="Spectral_r")
        ax.set_title(gamma_list[i], fontsize=20)
        sc = ax.scatter(*aligned_mapper.embeddings_[i].T, s=10, c=energies, cmap="Spectral_r")
        
        # A colorbar is attached only to the panel of gamma index 1.
        if i == 1:
            cbar = plt.colorbar(sc)
            cbar.ax.tick_params(size=0)

        ax.axis(ax_bound)
        ax.set(xticks=[], yticks=[])
plt.tight_layout()
plt.savefig('picks_aligned%d_gammas%d.jpg' % (nn, len(gamma_list)), dpi=300)

In [None]:
# Single-row figure for picked gamma indices, each panel on its own autoscaled limits
# (the shared ax_bound is computed but deliberately not applied below).
#picks = [14, 11, 10, 9, 7, 6, 5, 1]
#picks = [13, 11, 10, 9, 7, 6, 2, 1]
#picks = [13, 9, 7, 6, 4, 3, 1, 0]
# The first assignment is immediately overridden by the second.
picks = [13, 7, 6, 5, 4, 3, 1, 0]
picks = [15, 14, 13, 7, 6, 5, 4, 3, 2, 1, 0]
#picks = [14, 11, 10, 9, 7, 6, 5, 0]
#picks = [0]

print("Max picks:", len(manyruns_paths) - 1)
print("Picks:", [gamma_list[p] for p in picks])


# square selects the shorter figure, smaller markers and a lower title offset.
square = True

num_rows = 1
num_cols = len(picks)

if square:
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(30, 7))  # short layout: 30 x 7
else:
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(30, 10))  # TALL 30 x 10

ax_bound = axis_bounds(np.vstack(aligned_mapper.embeddings_))
for j, ax in enumerate(axs.flatten()):
    if j<len(picks):
        
        i = picks[j]
        #current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
        
        manyruns_path = manyruns_paths[i]
        print(i, gamma_list[i], manyruns_path)
        fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
        X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
        # First energy column, restricted to the nn rows that were embedded.
        energies = X_energies[:nn, 0]
        
        #ax.scatter(*aligned_mapper.embeddings_[i].T, s=2, cmap="Spectral_r")
        #ax.set_title(gamma_list[i], fontsize=30, y=None)
        # A negative title y places the gamma value below the panel.
        if square:
            ax.set_title(gamma_list[i], fontsize=30, y=-0.12)
            sc = ax.scatter(*aligned_mapper.embeddings_[i].T, s=5, c=energies, cmap="Spectral_r")
        else:
            ax.set_title(gamma_list[i], fontsize=30, y=-0.06)
            sc = ax.scatter(*aligned_mapper.embeddings_[i].T, s=10, c=energies, cmap="Spectral_r")

        
        #cbar = plt.colorbar(sc)
        #cbar.ax.tick_params(size=0)

        #ax.axis(ax_bound)
        ax.set(xticks=[], yticks=[])
plt.tight_layout()
plt.savefig('picks_nobound_aligned%d_gammas%d.jpg' % (nn, len(gamma_list)), dpi=300)

In [None]:
# 2x4 grid for eight picked gamma indices, each panel on its own autoscaled limits
# (the shared ax_bound is computed but deliberately not applied below).
#picks = [14, 11, 10, 9, 7, 6, 5, 1]
#picks = [14, 11, 10, 9, 7, 6, 5, 1]
picks = [13, 7, 6, 5, 4, 3, 1, 0]
#picks = [14, 11, 10, 9, 7, 6, 5, 0]
#picks = [0]

#picks = [13, 12, 11, 10, 9, 8, 7, 0]#, 4, 3, 1, 0]
#picks = picks[::-1]

# square selects the compact figure, smaller markers/fonts and a lower title offset.
square = True

num_rows = 2
num_cols = 4

if square:
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(18*0.68, 12*0.68))  # compact layout: 18 x 12 scaled by 0.68
else:
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(30, 10))  # TALL 30 x 10

ax_bound = axis_bounds(np.vstack(aligned_mapper.embeddings_))
for j, ax in enumerate(axs.flatten()):
    if j<len(picks):
        
        i = picks[j]
        #current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
        
        manyruns_path = manyruns_paths[i]
        print(i, gamma_list[i], manyruns_path)
        fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
        X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
        # First energy column, restricted to the nn rows that were embedded.
        energies = X_energies[:nn, 0]
        
        #ax.scatter(*aligned_mapper.embeddings_[i].T, s=2, cmap="Spectral_r")
        #ax.set_title(gamma_list[i], fontsize=30, y=None)
        # A negative title y places the gamma value below the panel.
        if square:
            ax.set_title(gamma_list[i], fontsize=20, y=-0.12)
            sc = ax.scatter(*aligned_mapper.embeddings_[i].T, s=2, c=energies, cmap="Spectral_r")
        else:
            ax.set_title(gamma_list[i], fontsize=30, y=-0.06)
            sc = ax.scatter(*aligned_mapper.embeddings_[i].T, s=10, c=energies, cmap="Spectral_r")

        
        #cbar = plt.colorbar(sc)
        #cbar.ax.tick_params(size=0)

        #ax.axis(ax_bound)
        ax.set(xticks=[], yticks=[])
plt.tight_layout()
plt.savefig('picks_nobound_2x4_aligned%d_gammas%d.jpg' % (nn, len(gamma_list)), dpi=300)

**Single 2D figure**

In [None]:
import pandas as pd 
import plotly.figure_factory as ff
import plotly.express as px

# Index into gamma_list / manyruns_paths of the embedding to show.
pick = 1

# True: interactive plotly scatter (also written to HTML); False: static matplotlib scatter.
use_plotly = True

manyruns_path = manyruns_paths[pick]
print(pick, gamma_list[pick], manyruns_path)
fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
# First energy column, restricted to the nn rows that were embedded.
energies = X_energies[:nn, 0]

X = aligned_mapper.embeddings_[pick]
print(X.shape)

# ===============================
if use_plotly:
    clabel = 'energies'
    # 'index' is the row number, shown on hover so individual points can be identified.
    df = pd.DataFrame({'index': range(nn),
                        clabel: energies,
                       'x': X[:, 0],
                       'y': X[:, 1],
                       'z': energies})

    fig = px.scatter(df, x='x', y='y',
                     color=clabel,
                     title='jupyter',
                     hover_name='index')
    fig.write_html('simple2d_pick%d.html' % pick)
    fig.show()
else:
    # The matplotlib figure is created only on this branch, so the plotly path
    # no longer leaves an empty 10x10 figure behind.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    #ax.scatter(X[:,0], X[:,1], c=energies, s=20, cmap='Spectral_r')
    
    #ax.scatter(X[:,0], X[:,1], c=energies, s=10, cmap='Spectral_r')
    # Uncoloured scatter: no c values are given, so no cmap is passed (it would be ignored with a warning).
    ax.scatter(X[:,0], X[:,1], s=10)

    #ax.set_xlabel('X Label')
    #ax.set_ylabel('Y Label')
    ax.set_xticks([])
    ax.set_yticks([])
    plt.savefig('ttt.jpg')
    plt.show()

**Single 2D+E landscape**

In [None]:
import pandas as pd 
import plotly.figure_factory as ff
import plotly.express as px

# Index into gamma_list / manyruns_paths of the embedding to show.
pick = 1

# True: interactive plotly 3D scatter; False: static matplotlib 3D scatter.
use_plotly = True

manyruns_path = manyruns_paths[pick]
print(pick, gamma_list[pick], manyruns_path)
fpath_energy = manyruns_path + os.sep + 'aggregate' + os.sep + 'X_energy_last.npz'
X_energies = np.load(fpath_energy)['arr_0'].T  # umap wants transpose (?)
# First energy column, restricted to the nn rows that were embedded.
energies = X_energies[:nn, 0]

X = aligned_mapper.embeddings_[pick]
print(X.shape)

# Debug print of one specific point; guarded so a smaller nn does not raise IndexError.
if X.shape[0] > 2602:
    print(X[2602,:])

# ===============================
if use_plotly:
    clabel = 'energies'
    # 'index' is the row number, shown on hover; energy is used both as colour and as height (z).
    df = pd.DataFrame({'index': range(nn),
                        clabel: energies,
                       'x': X[:, 0],
                       'y': X[:, 1],
                       'z': energies})

    fig = px.scatter_3d(df, x='x', y='y', z='z',
                        color=clabel,
                        title='jupyter',
                        hover_name='index')
    fig.show()
else:
    # The matplotlib figure is created only on this branch, so the plotly path
    # no longer leaves an empty 10x10 figure behind.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X[:,0], X[:,1], energies, c=energies, cmap='Spectral_r')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    plt.show()


# Plot specific points from index

In [None]:
from multicell.unsupervised_helper import plot_given_multicell


# Row indices (into the aggregated state array) of the lattice states to plot.
agg_indices = [911]
outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'plot_specific_points'

# where is the data?
# step = None selects the '_last' aggregate; an integer selects the aggregate saved at that step.
step = None
#dirname = 'Wrandom0_gamma0.20_10k_periodic_fixedorderV3_p3_M100'
dirname = 'Wrandom0_gamma1.00_10k_periodic_fixedorderV3_p3_M100'


#step = 14
#dirname = 'beta2.05_Wrandom0_gamma0.20_10k_periodic_fixedorderV3_p3_M100'

manyruns_path = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
fpath_pickle = manyruns_path + os.sep + 'multicell_template.pkl'
# NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted run outputs.
with open(fpath_pickle, 'rb') as pickle_file:
    multicell = pickle.load(pickle_file)  # unpickling multicell object

for agg_index in agg_indices:  
    #smod = ''
    smod = '_last'
    if step is not None:
        smod = '_%d' % step
    
    agg_dir = manyruns_path + os.sep + 'aggregate'
    fpath_state = agg_dir + os.sep + 'X_aggregate%s.npz' % smod
    fpath_energy = agg_dir + os.sep + 'X_energy%s.npz' % smod
    fpath_pickle = manyruns_path + os.sep + 'multicell_template.pkl'
    print(fpath_state)
    # The file path does not depend on agg_index, so the same array is re-read on every iteration.
    X = np.load(fpath_state)['arr_0'].T  # umap wants transpose
    X_state = X[agg_index, :]
    
    # Overwrite column 0 of the template's state array with the chosen state, then plot that column.
    step_hack = 0  # TODO care this will break if class has time-varying applied field
    multicell.graph_state_arr[:, step_hack] = X_state[:]
    #assert np.array_equal(multicell_template.field_applied, np.zeros((total_spins, multicell_template.total_steps)))
    plot_given_multicell(multicell, step_hack, agg_index, outdir)


**Check individual lattice states**

In [None]:
from utils.file_io import run_subdir_setup, RUNS_FOLDER, INPUT_FOLDER
from multicell.graph_helper import state_load
from multicell.graph_adjacency import lattice_square_loc_to_int
from singlecell.singlecell_simsetup import singlecell_simsetup

# Directory that is scanned for saved '.npz' lattice states in the next cell
replot_dir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'replot'

sidelength = 20           # passed to lattice_square_loc_to_int when locating a cell on the lattice
curated = True
random_mem = False        # TODO incorporate seed in random XI in simsetup/curated
random_W = False          # TODO incorporate seed in random W in simsetup/curated
W_override_path = INPUT_FOLDER + os.sep + 'manual_WJ' + os.sep + 'simsetup_W_9_maze.txt'
# Build the single-cell simulation setup (a dict-like object; keys 'N', 'P', 'XI', 'FIELD_SEND' are used below)
simsetup_main = singlecell_simsetup(
    unfolding=True, random_mem=random_mem, random_W=random_W, curated=curated, housekeeping=0)
# W_override_path is always set above, so this branch always runs in this cell;
# set W_override_path = None to keep the W produced by singlecell_simsetup
if W_override_path is not None:
    print('Note: in main, overriding W from file...')
    explicit_W = np.loadtxt(W_override_path, delimiter=',')  # comma-separated text matrix
    simsetup_main['FIELD_SEND'] = explicit_W
print("simsetup checks:")
print("\tsimsetup['N'],", simsetup_main['N'])
print("\tsimsetup['P'],", simsetup_main['P'])

In [None]:
# Collect the '.npz' files found in replot_dir.
# os.listdir returns entries in arbitrary order, so sort the names: positions in
# `fpaths` are then reproducible across machines and kernel restarts.
fnames = sorted(a for a in os.listdir(replot_dir) if a.endswith('.npz'))
fpaths = [replot_dir + os.sep + a for a in fnames]
print(fpaths)



In [None]:
# Load one of the saved lattice states listed in `fpaths` (position 9 is hard-coded).
# NOTE(review): which file this picks depends on the ordering of `fpaths` -- confirm it is the intended one.
print(fpaths[9])
X = state_load(fpaths[9], cells_as_cols=True, num_genes=None, num_cells=None, txt=False)

In [None]:
# Inspect the state of a single cell of the loaded lattice.
loc = (0,-3)  # location handed to lattice_square_loc_to_int (NOTE(review): negative coordinate presumably wraps -- confirm)
node_idx = lattice_square_loc_to_int(loc, sidelength)
cellstate = X[:, node_idx]  # column node_idx of X (X was loaded with cells_as_cols=True)
print(cellstate)
# Dot product of each column of XI with the cell state, scaled by 1/9.
# NOTE(review): 9.0 presumably equals simsetup_main['N'] -- confirm before reusing with another setup
print(np.dot(simsetup_main['XI'].T, cellstate)/9.0)

In [None]:
# --- Palette: named colours as 0-255 RGB triplets ---
turquoise = [30, 223, 214]

white = [255, 255, 255]
soft_grey = [225, 220, 222]
soft_grey_alt1 = [206, 199, 182]
soft_grey_alt2 = [219, 219, 219]
beige = [250, 227, 199]

soft_blue = [148, 210, 226]
soft_blue_alt1 = [58, 128, 191]

soft_red = [192, 86, 64]
soft_red_alt1 = [240, 166, 144]
soft_red_alt2 = [255, 134, 113]

soft_yellow = [237, 209, 112]

soft_orange = [250, 173, 63]
soft_orange_alt1 = [248, 200, 140]

soft_green = [120, 194, 153]
sharp_green = [142, 200, 50]

soft_purple = [177, 156, 217]


def _rgb_to_unit(rgb):
    """Scale a 0-255 RGB triplet to a float array with components in [0, 1]."""
    return np.array(rgb) / 255.0


# --- Normalised colours (float arrays in [0, 1]) ---
soft_grey_norm = _rgb_to_unit(soft_grey)

# anchor = the colour that an interpolation weight of 0 maps to
color_anchor_beige = _rgb_to_unit(beige)
color_anchor_white = _rgb_to_unit(white)
color_anchor = color_anchor_white

# positive / negative colour for each of the three channels A, B, C
#color_A_pos = _rgb_to_unit(soft_blue_alt1)
color_A_pos = _rgb_to_unit(soft_blue)
color_A_neg = _rgb_to_unit(soft_orange)

color_B_pos = _rgb_to_unit(soft_red)
color_B_neg = _rgb_to_unit(soft_green)

color_C_pos = _rgb_to_unit(soft_yellow)
color_C_neg = _rgb_to_unit(soft_purple)


def linear_interpolate(val, c2, c1=color_anchor):
    """
    Blend linearly from colour c1 (val = 0) to colour c2 (val = 1).

    val: blend weight; must satisfy 0 <= val <= 1 (a small excess above 1 is tolerated)
    c2:  target colour (array)
    c1:  start colour (array); defaults to the module-level color_anchor
    Returns c1 + val * (c2 - c1).
    Raises AssertionError if val is outside the accepted range.
    """
    tolerance = 1e-4  # slack on the upper bound only
    assert 0.0 <= val <= 1.0 + tolerance
    return c1 + val * (c2 - c1)

In [None]:
def fill_arr_color(color):
    """
    Return a 10 x 10 x 3 float array in which every pixel has the given colour.

    color: indexable with at least three components (channel 0, 1, 2)
    """
    patch = np.zeros((10, 10, 3))
    for channel in range(3):
        patch[:, :, channel] += color[channel]
    return patch

# Show the three "positive" colours A, B, C side by side as single-pixel images
fig, axarr = plt.subplots(1, 3)
a, b, c = (np.array([base_color]).reshape(1, 1, 3)
           for base_color in (color_A_pos, color_B_pos, color_C_pos))
for panel, swatch in zip(axarr, (a, b, c)):
    panel.imshow(swatch)
plt.show()

In [None]:
# Show pairwise blends of the positive colours: 30% of the way from the anchor (c1) to the target
fig, axarr = plt.subplots(1, 3)
mix_pairs = [
    (color_A_pos, color_B_pos),  # (target, anchor)
    (color_A_pos, color_C_pos),
    (color_B_pos, color_C_pos),
]
swatches = []
for panel, (target_color, anchor_color) in zip(axarr, mix_pairs):
    colour_mix = linear_interpolate(0.3, target_color, c1=anchor_color)
    swatch = np.array([colour_mix]).reshape(1, 1, 3)
    panel.imshow(swatch)
    swatches.append(swatch)
a, b, c = swatches
plt.show()

In [None]:
# Show three colours faded towards white: each is blended from white by the same `amount`
a, b, c = (np.array(rgb) / 255.0 for rgb in (soft_purple, soft_green, soft_orange))

amount = np.sqrt(0.11)

fig, axarr = plt.subplots(1, 3)
for panel, target_color in zip(axarr, (a, b, c)):
    colour_mix = linear_interpolate(amount, target_color, c1=color_anchor_white)
    panel.imshow(np.array([colour_mix]).reshape(1, 1, 3))

plt.show()

In [None]:
# Scratch cell: displays the square root of 0.2 as the cell output
np.sqrt(0.2)

# Manual UMAP plots

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import pickle
import joblib
import pandas as pd
import time

import plotly
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go

import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

sns.set(style='white', context='notebook', rc={'figure.figsize':(14,10)})

from singlecell.singlecell_linalg import sorted_eig
from utils.file_io import RUNS_FOLDER

In [None]:
from multicell.unsupervised_helper import make_dimreduce_object, save_dimreduce_object

def _embedding_dataframe(embedding, num_runs, clabel, c, z=None):
    """Build the plotting DataFrame: run index, colour column, x/y and (optionally) z."""
    columns = {'index': range(num_runs)}
    columns[clabel] = c  # if clabel == 'index' this replaces the range above (same values)
    columns['x'] = embedding[:, 0]
    columns['y'] = embedding[:, 1]
    if z is not None:
        columns['z'] = z
    return pd.DataFrame(columns)


def plotly_express_embedding_LOCAL(data_subdict, color_by_index=False, as_landscape=False,
                             fmod='', show=False, dirpath=None, surf=False, step=None):
    """
    Plot every embedding stored in data_subdict['algos'] with plotly and write each
    figure to '<dirpath>/<algo>_plotly_<label><fmod>_jupyter[...]' as both .html and .png.

    Supports 2D and 3D embeddings (asserted per algorithm).

    data_subdict: dict with keys 'num_runs', 'label', 'algos' (algo name -> dict holding
        'embedding'), 'energies' (column 0 is used) and, when dirpath is None, 'path'
    color_by_index: for troubleshooting, colors the points according to their array position
        if False (default), color by energy instead
    as_landscape: if True, plot the first two embedding coordinates as x, y and the
        energy as z (3D scatter, or a triangulated surface when surf is True)
    fmod: suffix inserted into the output filenames
    show: if True, also display each figure
    dirpath: output directory; defaults to data_subdict['path'] + '/dimreduce' (created if missing)
    surf: only used when as_landscape is True; draw a trisurf instead of a scatter
    step: if not None, appended to the plot title as ' (step N)'
    """
    # colormaps here: https://plotly.com/python/builtin-colorscales/
    fmod += '_jupyter'

    num_runs = data_subdict['num_runs']
    label = data_subdict['label']
    if dirpath is None:
        dirpath = data_subdict['path'] + os.sep + 'dimreduce'
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(dirpath, exist_ok=True)

    smod = ''
    if step is not None:
        smod = ' (step %d)' % step

    if color_by_index:
        c = np.arange(num_runs)
        fmod += '_cIndex'
        clabel = 'index'
    else:
        c = data_subdict['energies'][:, 0]  # range(num_runs)
        clabel = 'energy'

    for algo, algodict in data_subdict['algos'].items():
        embedding = algodict['embedding']

        n_components = embedding.shape[1]
        assert n_components in [2, 3]

        plot_title = '%s of %s dataset%s' % (algo, label, smod)
        plot_path = dirpath + os.sep + "%s_plotly_%s%s" % (algo, label, fmod)

        if not as_landscape:
            if n_components == 2:
                df = _embedding_dataframe(embedding, num_runs, clabel, c)
                fig = px.scatter(df, x='x', y='y',
                                 color=clabel,
                                 title=plot_title,
                                 hover_name='index')
                # transparent plot and paper background
                fig.update_layout({
                    'plot_bgcolor': 'rgba(0,0,0,0)',
                    'paper_bgcolor': 'rgba(0,0,0,0)'})
            else:
                df = _embedding_dataframe(embedding, num_runs, clabel, c,
                                          z=embedding[:, 2])
                fig = px.scatter_3d(df, x='x', y='y', z='z',
                                    color=clabel,
                                    title=plot_title,
                                    hover_name='index')
        else:
            plot_title += ' landscape'
            plot_path += '_landscape'
            # landscape: height is the energy, regardless of the colour choice
            df = _embedding_dataframe(embedding, num_runs, clabel, c,
                                      z=data_subdict['energies'][:, 0])
            if surf:
                plot_title += ' surface'
                plot_path += 'Surf'

                # Regular trisurf approach (ugly): triangulate the 2D embedding points
                from scipy.spatial import Delaunay

                points2D = np.vstack([embedding[:, 0], embedding[:, 1]]).T
                simplices = Delaunay(points2D).simplices

                fig = ff.create_trisurf(
                    x=df['x'], y=df['y'], z=df['z'],
                    colormap="Thermal",
                    simplices=simplices,
                    title=plot_title)
            else:
                fig = px.scatter_3d(df, x='x', y='y', z='z',
                                    color=clabel,
                                    title=plot_title,
                                    hover_name='index')

        fig.write_html(plot_path + '.html')
        fig.write_image(plot_path + '.png')
        if show:
            fig.show()
    return

In [None]:
# these set the defaults for modifications introduced in main
REDUCER_SEED = 100
REDUCER_COMPONENTS = 3  # default only: the main loop below overrides 'n_components' per iteration
#REDUCERS_TO_USE = ['pca']
#REDUCERS_TO_USE = ['tsne']
REDUCERS_TO_USE = ['umap']  # used below when building the output filename suffix
#REDUCERS_TO_USE = ['umap', 'tsne', 'pca']
#VALID_REDUCERS = ['umap', 'tsne', 'pca']

# see defaults: https://umap-learn.readthedocs.io/en/latest/api.html
UMAP_KWARGS = {
    'random_state': REDUCER_SEED,
    'n_components': REDUCER_COMPONENTS,
    'metric': 'euclidean',
    'init': 'spectral',
    'unique': False,
    'n_neighbors': 15,
    'min_dist': 0.1,
    'spread': 1.0,
}
TSNE_KWARGS = {
    'random_state': REDUCER_SEED,
    'n_components': REDUCER_COMPONENTS,
    'metric': 'euclidean',
    'init': 'random',
    'perplexity': 30.0,
}
PCA_KWARGS = {
    'n_components': REDUCER_COMPONENTS,
}


# main flags
# (the dim-reduction loop below runs if any of the first four flags is True)
build_dimreduce_dicts = True
add_control_data = False      # also embed a 'control (coin-flips)' dataset
vis_all = True                # write the plotly figures for every dataset
pca_assess = False            # only consulted inside the vis_all branch
plot_specific_points = False  # re-plot hard-coded aggregate indices
check_evals = False           # plot the eigenvalue spectrum of J_multicell

# data process settings
use_01 = True         # adds '_use01' to the output filename suffix
jitter_scale = 0  #1e-4
nsubsample = None  # None or an int

# Step 0) which 'manyruns' dirs to work with
#gamma_list = [0.0, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.15, 0.20, 0.4, 0.6, 0.8, 0.9, 1.0, 20.0]
gamma_list = [20.0]

#gamma_list = [0.0, 0.2]
# gamma_list = [2.0, 20.0]

step_list = [None]
# step_list = [0.0, 10.0]  # list of [None] or list of steps
#step_list = [0, 1, 2, 3] + list(np.arange(4, 20, 5))
#step_list = [0, 1, 2]
#step_list = [0] + list(range(4, 30, 5))
#step_list = list(range(0, 10, 1))

# one run directory name per gamma value
#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_p3_M100' % a for a in gamma_list]
#manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_fixedorderNotOrig_p3_M100' % a for a in gamma_list]
#manyruns_dirnames = ['Wrandom1_gamma%.2f_10k_fixedorder_p3_M100' % a for a in gamma_list]
manyruns_dirnames = ['Wrandom0_gamma%.2f_10k_periodic_fixedorderV3_p3_M100' % a for a in gamma_list]

manyruns_paths = [RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + dirname
                  for dirname in manyruns_dirnames]

# Step 1) umap (or other dim reduction) kwargs
# For every (n_components, step) combination: build the reducer kwargs and the filename
# suffix `fmod`, load or compute the dim-reduction object of every run directory, then
# optionally add control data, write the figures, and re-plot selected states.
if any([build_dimreduce_dicts, add_control_data, vis_all, pca_assess]):
    for n_components in [2]:

        for step in step_list:
            #n_components = 3
            # work on copies so the module-level defaults stay untouched
            pca_kwargs = PCA_KWARGS.copy()
            pca_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
            umap_kwargs = UMAP_KWARGS.copy()
            umap_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
            tsne_kwargs = TSNE_KWARGS.copy()
            tsne_kwargs['n_components'] = n_components  # TODO don't need to spec 'live', can embed later?
            # modify pca settings
            # modify umap settings
            #umap_kwargs['unique'] = True
            #umap_kwargs['n_neighbors'] = 100
            umap_kwargs['min_dist'] = 0.25
            umap_kwargs['spread'] = 1.0
            #umap_kwargs['metric'] = 'euclidean'
            # modify tsne settings
            #tsne_kwargs['perplexity'] = 100

            # Modify filename suffix for dimreduce pkl and plots
            # (only settings that differ from the defaults are encoded)
            fmod = ''
            if step is not None:
                fmod += '_step%d' % step
            fmod += '_F=' + '+'.join(REDUCERS_TO_USE)
            fmod += '_dim%d_seed%d' % (umap_kwargs['n_components'],
                                       umap_kwargs['random_state'])
            if use_01:
                fmod += '_use01'
            if nsubsample is not None:
                fmod += '_nn%d' % nsubsample
            if jitter_scale > 0:
                fmod += '_jitter%.4f' % jitter_scale
            if 'umap' in REDUCERS_TO_USE:
                if umap_kwargs['metric'] != 'euclidean':
                    fmod += '_%s' % umap_kwargs['metric']
                if umap_kwargs['init'] != 'spectral':
                    fmod += '_%s' % umap_kwargs['init']
                if umap_kwargs['n_neighbors'] != 15:
                    fmod += '_nbor%d' % umap_kwargs['n_neighbors']
                if umap_kwargs['min_dist'] != 0.1:
                    fmod += '_dist%.2f' % umap_kwargs['min_dist']
                if umap_kwargs['spread'] != 1.0:
                    fmod += '_spread%.2f' % umap_kwargs['spread']
                if umap_kwargs['unique']:
                    fmod += '_unique'
            if 'tsne' in REDUCERS_TO_USE:
                if tsne_kwargs['perplexity'] != 30.0:
                    fmod += '_perplex%.2f' % tsne_kwargs['perplexity']

            # Step 2) make/load data
            datasets = {i: {'label': manyruns_dirnames[i],
                            'path': manyruns_paths[i]}
                        for i in range(len(manyruns_dirnames))}

            for idx in range(len(manyruns_dirnames)):
                fpath = manyruns_paths[idx] + os.sep + 'dimreduce' \
                        + os.sep + 'dimreduce%s.z' % fmod
                if os.path.isfile(fpath):
                    print('Exists already, loading: %s' % fpath)
                    fcontents = joblib.load(fpath)  # just load file if it exists
                    datasets[idx] = fcontents
                else:
                    print('Dim. reduction on manyruns: %s' % manyruns_dirnames[idx])
                    # use_01 follows the flag (it was hard-coded to True), so the data
                    # always matches the '_use01' marker encoded in `fmod` / the cache filename
                    datasets[idx] = make_dimreduce_object(
                        datasets[idx], nsubsample=nsubsample, flag_control=False,
                        use_01=use_01, jitter_scale=jitter_scale,
                        umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs,
                        step=step)
                    save_dimreduce_object(datasets[idx], fpath)  # save to file (joblib)

            if add_control_data:
                print('adding control data...')
                total_spins_0 = datasets[0]['total_spins']
                num_runs_0 = datasets[0]['num_runs']

                # add control data into the dict of datasets
                # NOTE(review): `generate_control_data` is not defined or imported in this
                # cell -- confirm it exists in the kernel before enabling add_control_data
                control_X = generate_control_data(total_spins_0, num_runs_0)
                control_folder = RUNS_FOLDER + os.sep + 'multicell_manyruns' + os.sep + 'control'
                control_fpath = control_folder + os.sep + \
                                'dimreduce' + os.sep + 'dimreduce%s.z' % fmod

                # the control dataset is stored under key -1, next to the real datasets 0..n-1
                datasets[-1] = {
                    'data': control_X,
                    'label': 'control (coin-flips)',
                    'num_runs': num_runs_0,
                    'total_spins': total_spins_0,
                    'energies': np.zeros((num_runs_0, 5)),
                    'path': control_folder
                }
                datasets[-1] = make_dimreduce_object(
                    datasets[-1], flag_control=True,
                    nsubsample=nsubsample, jitter_scale=jitter_scale, use_01=use_01,
                    umap_kwargs=umap_kwargs, tsne_kwargs=tsne_kwargs, pca_kwargs=pca_kwargs)
                save_dimreduce_object(datasets[-1], control_fpath)  # save to file (joblib)

            # Step 3) vis data
            if vis_all:
                for idx in range(0, len(manyruns_dirnames)):
                    # three figures per dataset: coloured by energy, coloured by index, energy landscape
                    plotly_express_embedding_LOCAL(
                        datasets[idx], fmod=fmod, show=False,
                        step=step)
                    plotly_express_embedding_LOCAL(
                        datasets[idx], fmod=fmod, color_by_index=True, show=False,
                        step=step)
                    plotly_express_embedding_LOCAL(
                        datasets[idx], fmod=fmod, as_landscape=True, show=False,
                        step=step)
                    #plotly_express_embedding(
                    #    datasets[idx], fmod=fmod, as_landscape=True, show=False, surf=True)
                    if pca_assess:
                        # NOTE(review): `pca_assess_dataset` is not defined or imported in this
                        # cell -- confirm it exists in the kernel before enabling pca_assess
                        pca_assess_dataset(datasets[idx], fmod=fmod, show=False)

                if add_control_data:
                    plotly_express_embedding_LOCAL(datasets[-1], fmod=fmod, color_by_index=True)
                    if pca_assess:
                        pca_assess_dataset(datasets[-1], fmod=fmod, show=False)

            # Step 3b) plot special indices of the multicell state
            if plot_specific_points:
                #agg_indices = [2611, 2289]
                agg_indices = [481, 4774]
                outdir = RUNS_FOLDER + os.sep + 'explore' + os.sep + 'plot_specific_points'

                for idx in range(0, len(manyruns_dirnames)):

                    multicell = datasets[idx]['multicell_template']

                    for agg_index in agg_indices:
                        # pull relevant info from subdict
                        X = datasets[idx]['data'][agg_index, :]
                        step_hack = 0  # TODO care this will break if class has time-varying applied field
                        multicell.graph_state_arr[:, step_hack] = X[:]
                        #assert np.array_equal(multicell_template.field_applied, np.zeros((total_spins, multicell_template.total_steps)))
                        plot_given_multicell(multicell, step_hack, agg_index, outdir)

# Step 4) eval check of Jij
# For each run directory: unpickle the multicell template and scatter-plot the sorted
# eigenvalues of its J_multicell matrix.
if check_evals:
    for idx, dirpath in enumerate(manyruns_paths):
        fpath_pickle = dirpath + os.sep + 'multicell_template.pkl'
        # NOTE: pickle.load can execute arbitrary code -- only unpickle templates written by trusted runs
        with open(fpath_pickle, 'rb') as pickle_file:
            multicell_template = pickle.load(pickle_file)  # unpickling multicell object

        J_multicell = multicell_template.matrix_J_multicell
        evals, evecs = sorted_eig(J_multicell, take_real=True)
        plt.scatter(range(len(evals)), evals)
        plt.title(r'Spectrum of $J_{\mathrm{multicell}}$ for: %s' % os.path.basename(dirpath))
        # raw strings: '\l' is an invalid escape sequence in a normal string literal
        # (the resulting label text is unchanged)
        plt.xlabel(r'rank of $\lambda$')
        plt.ylabel(r'$\lambda$')
        plt.show()


In [None]:
def axis_bounds(embedding):
    """
    Axis limits that enclose a 2D embedding with a 10% margin on every side.

    embedding: array whose first two columns are the x and y coordinates
    Returns [xmin, xmax, ymin, ymax], each extended by 10% of the data range on that axis.
    """
    xs = embedding.T[0]
    ys = embedding.T[1]
    left, right = xs.min(), xs.max()
    bottom, top = ys.min(), ys.max()
    margin_x = (right - left) * 0.1
    margin_y = (top - bottom) * 0.1
    return [left - margin_x, right + margin_x, bottom - margin_y, top + margin_y]

# Grid of UMAP scatter plots, one panel per gamma value, all sharing the same axis limits
num_rows = 2
num_cols = 8
fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 10))
# common limits computed from all embeddings stacked together
ax_bound = axis_bounds(
    np.vstack( [datasets[i]['algos']['umap']['embedding'] for i in range(len(gamma_list))] )
)
for i, ax in enumerate(axs.flatten()):
    if i >= len(gamma_list):
        # more panels than datasets: drop the unused axes
        fig.delaxes(ax)
        continue
    print(i)
    #current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
    Xd = datasets[i]['algos']['umap']['embedding']
    energies = datasets[i]['energies'][:,0]
    print(Xd.shape)
    ax.set_title(gamma_list[i])
    #ax.scatter(Xd[:,0], Xd[:,1], s=1, cmap="Spectral")
    ax.scatter(Xd[:,0], Xd[:,1], s=1, c=energies, cmap="Spectral_r")  # colour = energy
    ax.axis(ax_bound)
    ax.set(xticks=[], yticks=[])
#plt.tight_layout()
#plt.show()
#plt.savefig('aligned%d_gammas%d.jpg' % (nn, len(gamma_list)))
plt.savefig('Subplots%d%s.jpg' % (len(gamma_list), fmod))
#plt.savefig('Subplots%d%s.pdf' % (len(gamma_list), fmod))
plt.show()


In [None]:
# Single UMAP panel for one dataset, coloured by energy
plt.figure(figsize=(3, 7))
ax = plt.gca()

# NOTE(review): i must be a valid key of `datasets` / index of `gamma_list`;
# with a one-element gamma_list only i = 0 exists -- confirm before running
i = 3
Xd = datasets[i]['algos']['umap']['embedding']
energies = datasets[i]['energies'][:,0]
print(Xd.shape)
ax.set_title(gamma_list[i])

#ax.scatter(Xd[:,0], Xd[:,1], s=1, cmap="Spectral")
#ax.scatter(Xd[:,0], Xd[:,1], s=1, c=energies, cmap="RdYlBu_r")
sc = ax.scatter(Xd[:,0], Xd[:,1], s=5, c=energies, cmap="Spectral_r", alpha=1.0, edgecolor='k', linewidths=0.0)

# The colorbar is off by default. Its tick styling is only applied when the colorbar
# is actually created (previously `cbar` was used without being defined in this cell).
show_colorbar = False
if show_colorbar:
    cbar = plt.colorbar(sc)
    cbar.ax.tick_params(size=0)

#ax.axis(ax_bound)
ax.set(xticks=[], yticks=[])
#plt.tight_layout()
#plt.savefig('Subplots%d%s.jpg' % (len(gamma_list), fmod))
#plt.savefig('Subplots%d%s.pdf' % (len(gamma_list), fmod))
plt.show()


In [None]:

# Grid of UMAP scatter plots for a hand-picked subset of the datasets (independent axis limits)
num_rows = 2
num_cols = 9
fig, axs = plt.subplots(num_rows, num_cols, figsize=(20, 10))
#ax_bound = axis_bounds(
#    np.vstack( [datasets[i]['algos']['umap']['embedding'] for i in range(len(gamma_list))] )
#)

# dataset indices to show, in panel order
#picks = list(range(len(gamma_list)))  # alternative: every dataset
picks = [0, 3, 4, 6, 7, 8, 9, 11, 13]

for j, ax in enumerate(axs.flatten()):
    if j >= len(picks):
        # more panels than picks: drop the unused axes
        fig.delaxes(ax)
        continue
    print(j)
    i = picks[j]
    #current_target = ordered_target[150 * i:min(ordered_target.shape[0], 150 * i + 400)]
    Xd = datasets[i]['algos']['umap']['embedding']
    energies = datasets[i]['energies'][:,0]
    print(Xd.shape)
    ax.set_title(gamma_list[i])
    #ax.scatter(Xd[:,0], Xd[:,1], s=1, cmap="Spectral")
    ax.scatter(Xd[:,0], Xd[:,1], s=1, c=energies, cmap="Spectral_r")  # colour = energy
    #ax.axis(ax_bound)
    ax.set(xticks=[], yticks=[])
#plt.tight_layout()
#plt.show()
#plt.savefig('aligned%d_gammas%d.jpg' % (nn, len(gamma_list)))
plt.savefig('PicksSubplots%d%s.jpg' % (len(gamma_list), fmod), dpi=300)
#plt.savefig('Subplots%d%s.pdf' % (len(gamma_list), fmod))
plt.show()
