In [1]:
#| default_exp dr

In [2]:
#| hide
%load_ext autoreload
%autoreload 2

# Dimensionality reduction

In [3]:
#| export
import umap
import cudf
import cuml
import pandas as pd
import numpy as np
from fastcore.all import *
from dvats.imports import *
from dvats.load import TSArtifact
from dvats.memory import *

In [4]:
#| export
def check_compatibility(dr_ar:TSArtifact, enc_ar:TSArtifact):
    """Check that the artifact used by the encoder model (`enc_ar`) and the artifact
    that is going to be passed through the DR (`dr_ar`) are compatible.

    Both artifacts are read through `metadata['TS']`. They are compatible when:
      - both list the same variables (`vars`),
      - both have the same frequency (`freq`),
      - `dr_ar` is not normalized (no `normalization` entry, or it is `None`),
      - `dr_ar` reports no missing values (`has_missing_values` is `"False"` or `False`).

    Prints "Artifacts are compatible." and returns `None` on success.
    On failure "Artifacts are not compatible." is printed and the error is raised.

    Raises:
        ValueError: when at least one check fails; the message names the failed checks.
        Exception: any error raised while reading the metadata (for example a
            `KeyError` for a missing entry) is re-raised unchanged.
    """
    try:
        dr_ts = dr_ar.metadata['TS']
        enc_ts = enc_ar.metadata['TS']
        checks = {
            # Both artifacts must have the same variables
            'vars': dr_ts['vars'] == enc_ts['vars'],
            # Both artifacts must have the same freq
            'freq': dr_ts['freq'] == enc_ts['freq'],
            # The dr artifact must not be normalized (not normalized data has not the key normalization)
            'normalization': dr_ts.get('normalization') is None,
            # The dr artifact must not have missing values. The comparison goes through
            # `str` so that both the string "False" and a boolean False are accepted.
            'has_missing_values': str(dr_ts['has_missing_values']) == "False",
        }
        failed = [name for name, ok in checks.items() if not ok]
        if failed:
            raise ValueError(f"Artifacts are not compatible; failed checks: {', '.join(failed)}")
        print("Artifacts are compatible.")
    except Exception:
        print("Artifacts are not compatible.")
        raise
    return None

## Get projections (UMAP, t-SNE, PCA)

In [5]:
#| export
#Comment this part after 4_seconds debugged
import hashlib

### Get UMAP projections

In [6]:
#| export
import warnings
import sys
from numba.core.errors import NumbaPerformanceWarning
@delegates(cuml.UMAP)
def get_UMAP_prjs(
    input_data, 
    cpu=True, 
    verbose = 0, 
    check_memory_usage = True,
    **kwargs
):
    """Compute the projections of `input_data` using UMAP, with a configuration contained in `**kwargs`.

    Args:
        input_data: data to project; it is handed as-is to the reducer's `fit_transform`.
            With `verbose > 0` it must also expose `tobytes()` (used for the debug checksum).
        cpu: if True the reducer is `umap.UMAP`, otherwise `cuml.UMAP`.
        verbose: values above 0 print debug information (the kwargs, MD5 checksums of the
            input and of the projections, the reducer and its parameters).
        check_memory_usage: if True, `gpu_memory_status()` is called before building the
            reducer and again after fitting.
        **kwargs: forwarded to the selected UMAP constructor. On the `cuml` path a
            non-`None` `random_state` is converted with `np.uint64` first.

    Returns:
        The result of the reducer's `fit_transform(input_data)`.
    """
    if verbose > 0: 
        print("--> get_UMAP_prjs")
        print("kwargs: ", kwargs)
        sys.stdout.flush()
        # Debug aid: fingerprint of the input so that runs can be compared
        checksum = hashlib.md5(input_data.tobytes()).hexdigest()
        print(checksum)
        
    if check_memory_usage: gpu_memory_status()
    
    warnings.filterwarnings("ignore", category=NumbaPerformanceWarning) # silence NumbaPerformanceWarning
    
    if cpu:
        print("-- umap.UMAP --", cpu)
        sys.stdout.flush()
        reducer = umap.UMAP(**kwargs)
    else:
        print("-- cuml.UMAP --", cpu)
        sys.stdout.flush()
        # Only convert a real seed: `random_state=None` means "no seed" and
        # `np.uint64(None)` would raise a TypeError.
        seed = kwargs.get('random_state')
        if seed is not None:
            kwargs['random_state'] = np.uint64(seed)
        reducer = cuml.UMAP(**kwargs)
    
    if verbose > 0:
        print("------- reducer --------")
        print(reducer)
        print(reducer.get_params())
        print("------- reducer --------")
        sys.stdout.flush()
    
    projections = reducer.fit_transform(input_data)
    
    if check_memory_usage: gpu_memory_status()
    if verbose > 0:
        # Debug aid: fingerprint of the output, useful to check reproducibility
        checksum = hashlib.md5(projections.tobytes()).hexdigest()
        print("prjs checksum ", checksum)
        print("get_UMAP_prjs -->")
        sys.stdout.flush()
    return projections

In [7]:
#| slow
# Smoke test of the cuml path (cpu=False) with a fixed seed: 5 random rows of 10 features.
foo = np.random.rand(5, 10)
bar = get_UMAP_prjs(
    foo, 
    cpu=False, 
    verbose = 1,
    check_memory_usage = True,
    random_state = 1234, #822569775
    n_neighbors=3, 
    min_dist=0.1
)
# One row of 2 components is expected per input row.
test_eq(bar.shape, (foo.shape[0], 2))

--> get_UMAP_prjs
kwargs:  {'random_state': 1234, 'n_neighbors': 3, 'min_dist': 0.1}
c59260215830587675990cdee9ac5067
GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m
-- cuml.UMAP -- False
------- reducer --------
UMAP()
{'handle': <pylibraft.common.handle.Handle object at 0x7f0017675b30>, 'verbose': 4, 'output_type': 'input', 'n_neighbors': 3, 'n_components': 2, 'n_epochs': None, 'learning_rate': 1.0, 'min_dist': 0.1, 'spread': 1.0, 'set_op_mix_ratio': 1.0, 'local_connectivity': 1.0, 'repulsion_strength': 1.0, 'negative_sample_rate': 5, 'transform_queue_size': 4.0, 'init': 'spectral', 'a': 1.5769434601962196, 'b': 0.8950608779914887, 'target_n_neighbors': -1, 'target_weight': 0.5, 'target_metric': 'categorical', 'hash_input': False, 'random_state': 1234, 'callback': None, 'metric': 'euclidean', 'metric_kwds': None, 'precomputed_knn': None}
------- reducer --------
GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m-------

In [8]:
#| slow
# Smoke test of the umap-learn path (cpu=True) without a seed: 5 random rows of 10 features.
foo = np.random.rand(5, 10)
bar = get_UMAP_prjs(
    foo, 
    cpu=True, 
    verbose = 1,
    check_memory_usage = True,
    n_neighbors=3, 
    min_dist=0.1
)
# One row of 2 components is expected per input row.
test_eq(bar.shape, (foo.shape[0], 2))

--> get_UMAP_prjs
kwargs:  {'n_neighbors': 3, 'min_dist': 0.1}
f6995d03566192f86db1486ecbc4441d
GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m
-- umap.UMAP -- True
------- reducer --------
UMAP(n_neighbors=3)
{'a': None, 'angular_rp_forest': False, 'b': None, 'dens_frac': 0.3, 'dens_lambda': 2.0, 'dens_var_shift': 0.1, 'densmap': False, 'disconnection_distance': None, 'force_approximation_algorithm': False, 'init': 'spectral', 'learning_rate': 1.0, 'local_connectivity': 1.0, 'low_memory': True, 'metric': 'euclidean', 'metric_kwds': None, 'min_dist': 0.1, 'n_components': 2, 'n_epochs': None, 'n_jobs': -1, 'n_neighbors': 3, 'negative_sample_rate': 5, 'output_dens': False, 'output_metric': 'euclidean', 'output_metric_kwds': None, 'precomputed_knn': (None, None, None), 'random_state': None, 'repulsion_strength': 1.0, 'set_op_mix_ratio': 1.0, 'spread': 1.0, 'target_metric': 'categorical', 'target_metric_kwds': None, 'target_n_neighbors':

If you want to have consistent results across executions, use `random_state`

In [9]:
#| hide
# Build the input in this cell instead of reusing `foo` from the previous cells:
# those are flagged `#| slow`, so `foo` is undefined whenever they are skipped.
foo = np.random.rand(5, 10)
# Two runs on the same data with the same seed must give identical projections.
bar = get_UMAP_prjs(foo, cpu=True, n_neighbors=3, random_state=1234)
baz = get_UMAP_prjs(foo, cpu=True, n_neighbors=3, random_state=1234)
test_eq(bar, baz)

GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m
-- umap.UMAP -- True


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m
GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m
-- umap.UMAP -- True


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m


### Get PCA projections

In [10]:
#| export
@delegates(cuml.PCA)
def get_PCA_prjs(X, cpu=False, **kwargs):
    r"""
    Project `X` with PCA.

    Only the GPU path exists: `**kwargs` configure a `cuml.PCA` reducer and the result
    of its `fit_transform(X)` is returned. `cpu=True` raises `NotImplementedError`.
    """
    # Guard clause: there is no CPU implementation
    if cpu: raise NotImplementedError
    return cuml.PCA(**kwargs).fit_transform(X)

In [11]:
#| hide
# Test the function get_PCA_prjs: 5 random rows of 10 features reduced to 2 components
foo = np.random.rand(5, 10)
bar = get_PCA_prjs(foo, cpu=False, n_components=2)
# One row per input row, with the requested number of components
test_eq(bar.shape, (foo.shape[0], 2))

### Get t-SNE projections

In [12]:
#| export
@delegates(cuml.TSNE)
def get_TSNE_prjs(X, cpu=False, **kwargs):
    r"""
    Project `X` with t-SNE.

    Only the GPU path exists: `**kwargs` configure a `cuml.TSNE` reducer and the result
    of its `fit_transform(X)` is returned. `cpu=True` raises `NotImplementedError`.
    """
    # Guard clause: there is no CPU implementation
    if cpu: raise NotImplementedError
    return cuml.TSNE(**kwargs).fit_transform(X)

In [13]:
#| hide
# Test the function get_TSNE_prjs: 90 random rows of 10 features, default configuration
foo = np.random.rand(90, 10)
bar = get_TSNE_prjs(foo, cpu=False)
# cuml.TSNE embeds into 2 components by default: one 2-d point per input row
test_eq(bar.shape, (foo.shape[0], 2))

  return func(**kwargs)


### Get PCA followed by UMAP projections


In [14]:
#| export
def get_PCA_UMAP_prjs(
    input_data : List [ np.float64 ], 
    cpu                = False, 
    check_memory_usage = True,
    verbose            = 0, 
    pca_kwargs         = {},
    umap_kwargs        = {}
):
    """
    Computes PCA -> UMAP projections of input data

    Args:
        input_data: data handed to `get_PCA_prjs`; with `verbose > 1` it must expose
            `.shape` (used in the debug message).
        cpu: forwarded to both `get_PCA_prjs` and `get_UMAP_prjs`.
        check_memory_usage: forwarded to `get_UMAP_prjs`, unless `umap_kwargs` already
            carries its own `check_memory_usage` entry, which then takes precedence.
        verbose: above 0 prints the shape of each intermediate result, above 1 also
            prints a message before each step. `get_UMAP_prjs` receives `verbose - 1`.
        pca_kwargs: keyword arguments for `get_PCA_prjs` (`None` is treated as empty).
        umap_kwargs: keyword arguments for `get_UMAP_prjs` (`None` is treated as empty).

    Returns:
        The projections returned by `get_UMAP_prjs` on the PCA output.
    """
    # The default dicts are only read, never mutated; `None` is accepted as "no options".
    pca_kwargs = pca_kwargs or {}
    # Forward `check_memory_usage` (it used to be silently dropped). An explicit entry
    # in `umap_kwargs` wins so that callers that already passed it there keep working.
    umap_call_kwargs = {'check_memory_usage': check_memory_usage, **(umap_kwargs or {})}

    if verbose > 1: print(f"About to compute PCA input_data~{input_data.shape}")
    prjs = get_PCA_prjs(
        X   = input_data, 
        cpu = cpu, 
        **pca_kwargs
    )

    if verbose > 0: print(f"PCA prjs~{prjs.shape}")
    if verbose > 1: print("About to compute UMAP")
        
    prjs = get_UMAP_prjs(
        input_data = prjs, 
        cpu        = cpu, 
        verbose    = verbose - 1,
        **umap_call_kwargs
    )

    if verbose > 0: print (f"UMAP prjs~{prjs.shape}")
    return prjs
    

In [15]:
#| hide
# Test the function get_PCA_UMAP_prjs: PCA to 3 components followed by a seeded UMAP
foo = np.random.rand(90, 10)
pca_kwargs = {'n_components':3}
umap_kwargs = {'random_state': 1234, 'n_neighbors': 3, 'min_dist':0.1}
bar = get_PCA_UMAP_prjs(
    input_data = foo, 
    cpu        = False, 
    verbose    = 1, 
    pca_kwargs = pca_kwargs, 
    umap_kwargs = umap_kwargs
)
# UMAP keeps its default of 2 components: one 2-d point per input row
test_eq(bar.shape, (foo.shape[0], 2))

PCA prjs~(90, 3)
--> get_UMAP_prjs
kwargs:  {'random_state': 1234, 'n_neighbors': 3, 'min_dist': 0.1}
27c57906f701ceec9f5ba94453d0de84
GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usage: [[90m--------------------[0m] [90m4%[0m
-- cuml.UMAP -- False
------- reducer --------
UMAP()
{'handle': <pylibraft.common.handle.Handle object at 0x7eff378a4540>, 'verbose': 4, 'output_type': 'input', 'n_neighbors': 3, 'n_components': 2, 'n_epochs': None, 'learning_rate': 1.0, 'min_dist': 0.1, 'spread': 1.0, 'set_op_mix_ratio': 1.0, 'local_connectivity': 1.0, 'repulsion_strength': 1.0, 'negative_sample_rate': 5, 'transform_queue_size': 4.0, 'init': 'spectral', 'a': 1.5769434601962196, 'b': 0.8950608779914887, 'target_n_neighbors': -1, 'target_weight': 0.5, 'target_metric': 'categorical', 'hash_input': False, 'random_state': 1234, 'callback': None, 'metric': 'euclidean', 'metric_kwds': None, 'precomputed_knn': None}
------- reducer --------
GPU | Used mem: 1
GPU | Used mem: 24
GPU | Memory Usa

## Cluster

In [16]:
#| export 

from sklearn.metrics import silhouette_score
def cluster_score(prjs, clusters_labels, verbose = 0):
    "Return the `silhouette_score` of `clusters_labels` over `prjs`; it is also printed when `verbose > 0`."
    silhouette = silhouette_score(prjs, clusters_labels)
    if verbose > 0:
        print("Silhouette_score:", silhouette)
    return silhouette

In [17]:
#| hide
#from nbdev.export import notebook2script
#notebook2script()
# NOTE(review): `beep` is not defined in this notebook; presumably it comes from one of
# the star imports and signals that the notebook finished running. Confirm.
beep(1)