# C33 UMAP Visualisation of the 300K Dataset

UMAP embeddings of the 300K dataset computed with different hyperparameters (here: several `n_neighbors` values).

## Imports

In [1]:
import sys 
import os
import pathlib
import time
print(sys.version)

3.6.9 (default, Jul  3 2019, 15:36:16) 
[GCC 5.4.0 20160609]


In [2]:
# Make the project-local `utils` package importable from this notebook.
# First report whether a `utils` directory sits next to the working directory
# (diagnostic only -- this path itself is not added to sys.path).
utils_path = pathlib.Path(os.getcwd()) / 'utils'
print(utils_path.exists(), os.getcwd(), sep='\n')

# Alternatives that were tried and left disabled:
#   sys.path.append(str(utils_path))
#   sys.path.append(os.getcwd())
# The parent directory is appended so the `utils` lib one level up is visible.
sys.path.append('../')
print(sys.path)

True
/home/mutaz/ilab1
['/usr/lib/python36.zip', '/usr/lib/python3.6', '/usr/lib/python3.6/lib-dynload', '', '/home/mutaz/.local/lib/python3.6/site-packages', '/usr/local/lib/python3.6/dist-packages', '/usr/lib/python3/dist-packages', '/home/mutaz/.local/lib/python3.6/site-packages/IPython/extensions', '/home/mutaz/.ipython', '../']


In [3]:
import numpy as np
import sklearn
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from utils.data import Data
from utils.config import Config

In [4]:
import umap
import numba

## Read Data

In [5]:
# Load the 300K feature table through the project's Data helper.
d = Data()
# v2 = True: loading the dataset with the con flag
# (presumably the `commensurate` flag used as a label below -- TODO confirm).
df = d.get300K_features(v2=True)

set index
time to load 17.78


### get descriptors column names

In [6]:
# Column names of the descriptors; used below to select the UMAP input features.
feature_cols = d.getDescriptorsColumnNames_C33()

## sampling bilayers

In [7]:
# Sample n rows; with n equal to the full row count this is just a shuffle.
n = df.shape[0]
#n = 50000   # uncomment for a quicker subsample

# Seeded so the sampled rows and their order are reproducible across kernel
# restarts (same seed value as the UMAP reducer inside umap_fit).
df_features = df.sample(n=n, random_state=50)

# Quick inspection of the label columns:
#df_features.C33
#df_features.commensurate
#df_features[['C33','commensurate']]

# UMAP

## umap_fit function

In [8]:
# call with df_features[feature_cols]
# labels_df carries the index column (e.g. 'uid') plus the label columns
# (e.g. C33 and commensurate), typically obtained via .reset_index()

def umap_fit(df, labels_df,
             plot_df_filename,
             index_name='uid',
             n_neighbors=15,
             n_components=2,
             min_dist=0.1,
             metric='euclidean',
             random_state=50):
    """Fit UMAP on ``df`` and save the embedding joined with its labels.

    Parameters
    ----------
    df : DataFrame or array-like
        Feature matrix handed to ``umap.UMAP.fit_transform``.
    labels_df : DataFrame
        One row per row of ``df``, in the same order. Must contain the
        column ``index_name``; all other columns are kept as labels.
    plot_df_filename : str
        File name passed to ``Config().get_datapath`` to build the CSV path.
    index_name : str
        Column of ``labels_df`` that becomes the index of the result.
    n_neighbors, n_components, min_dist, metric, random_state
        Forwarded unchanged to ``umap.UMAP``.

    Returns
    -------
    DataFrame
        Embedding columns (``x``, ``y`` for two components, ``x``, ``y``,
        ``z`` for three, ``dim1..dimN`` otherwise) followed by the label
        columns, indexed by ``index_name``. The same frame is written to CSV.

    Raises
    ------
    ValueError
        If ``labels_df`` does not have one row per embedded sample.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        min_dist=min_dist,
                        metric=metric,
                        random_state=random_state)
    embeddings = reducer.fit_transform(df)

    n_rows, n_dims = embeddings.shape[0], embeddings.shape[1]
    if len(labels_df) != n_rows:
        raise ValueError(
            'labels_df has {} rows but the embedding has {}'.format(
                len(labels_df), n_rows))

    # Column names must be an ordered sequence: a set has no defined order
    # (x/y could swap between runs) and newer pandas rejects it outright.
    if n_dims <= 3:
        embedding_cols = ['x', 'y', 'z'][:n_dims]
    else:
        embedding_cols = [f'dim{i + 1}' for i in range(n_dims)]
    embeddings_df = pd.DataFrame(embeddings, columns=embedding_cols)

    # concat(axis=1) aligns on the index, so give the labels the same
    # positional index as the embedding to guarantee a row-by-row join.
    aligned_labels = labels_df.reset_index(drop=True)
    plot_df = pd.concat([embeddings_df, aligned_labels], axis=1)
    plot_df = plot_df.set_index(index_name)

    plot_df.to_csv(Config().get_datapath(plot_df_filename))

    return plot_df

### Test with a small sample

In [None]:
# NOTE(review): the heading says "small sample", but n is the full row count;
# lower n here for a genuinely quick smoke test.
n = df.shape[0]
neighbors = 40
components = 2

# Seeded so the sampled rows and their order are reproducible across kernel
# restarts (same seed value as the UMAP reducer inside umap_fit).
df_features = df.sample(n=n, random_state=50)

# Output name encodes: label, sample size, n_neighbors, n_components.
filename_pattern = 'umap_300Kdf_{}_{}_{}_{}.csv'
filename = filename_pattern.format('C33', n, neighbors, components)

# reset_index() turns the frame's index into a regular column so umap_fit can
# re-use it as the index of the plot frame.
plot_df = umap_fit(df=df_features[feature_cols],
                   labels_df=df_features[['C33','commensurate']].reset_index(),
                   plot_df_filename=filename,
                   n_neighbors=neighbors,
                   n_components=components)

### How to cbind (column-bind) two DataFrames

``` {python}
embeddings = [[1,2], [3,4], [5,6]]
# columns must be an ordered list; a set has no defined order and newer pandas rejects it
embeddings_df = pd.DataFrame(embeddings, columns=['x','y'])
labels_df = pd.DataFrame({ 'C33':[.1, .2, .3], 'comm':[True, False, True]})
# axis=1 binds column-wise; rows are matched on the (default 0..n-1) index
plot_df = pd.concat([embeddings_df, labels_df], axis=1, ignore_index=False)
print(embeddings_df)
print(labels_df)
print(plot_df)
```

# Generate UMAP plot DataFrames

In [11]:
# Settings for the n_neighbors sweep below.
n = df.shape[0]
neighbors_list = [5, 15, 25, 30, 50]
components = 2

# Seeded so every re-run of the sweep embeds the same rows in the same order
# (same seed value as the UMAP reducer inside umap_fit).
df_features = df.sample(n=n, random_state=50)

# Output name encodes: label, sample size, n_neighbors, n_components.
filename_pattern = 'umap_300Kdf_{}_{}_{}_{}.csv'

In [None]:
# Run one UMAP fit per n_neighbors setting; each run writes its own CSV
# (via umap_fit) and reports the wall-clock time it took.
for neighbors in neighbors_list:
    time_start = time.time()

    filename = filename_pattern.format('C33', n, neighbors, components)
    print(f'prep for filename: {filename}')

    # Features go in as the UMAP input; labels ride along with the index
    # turned into a regular column by reset_index().
    umap_input = df_features[feature_cols]
    umap_labels = df_features[['C33','commensurate']].reset_index()
    plot_df = umap_fit(
        df=umap_input,
        labels_df=umap_labels,
        plot_df_filename=filename,
        n_neighbors=neighbors,
        n_components=components,
    )

    elapsed = time.time() - time_start
    print('for neighbors: {}. {:.3f} secs'.format(neighbors, elapsed))

    

prep for filename: umap_300Kdf_C33_296835_5_2.csv


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../.local/lib/python3.6/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../.local/lib/python3.6/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^

  current_graph, n_vertices, n_neighbors, max_candidates, rng_state
The keyword argument 'p