# Getting the embeddings

> This notebook gets the embeddings (i.e., the latent-space representation) of a multivariate time series, 
as produced by an encoder (e.g., an autoencoder)

> It uses sliding window view instead of sliding window for splitting the dataset

In [1]:
# This is only needed if the notebook is run in VSCode.
# The cell is a no-op unless the kernel was started with a `--vscode` argument.
import sys
if '--vscode' in sys.argv:
    print("Executing inside vscode")
    import nbs_pipeline.utils.vscode  as vs
    # Monkey-patch: replace `DisplayHandle.update` (as exposed by the helper
    # module) with the project's `update_patch` implementation.
    vs.DisplayHandle.update = vs.update_patch

In [2]:
from dvats.all import *
from tsai.data.preparation import prepare_forecasting_data
from tsai.data.validation import get_long_term_forecasting_splits, get_forecasting_splits
from fastcore.all import *
import wandb
wandb_api = wandb.Api()
from yaml import load, FullLoader

In [3]:
# Toggle for the GPU-memory report cells: every `mem.gpu_memory_status(...)`
# call in this notebook is guarded by this flag.
check_memory_usage = True

In [4]:
# Set up GPU-memory reporting and print a first snapshot.
# `torch.cuda.current_device()` raises when no CUDA device is usable, so on a
# CPU-only machine the reports are switched off instead of crashing the notebook;
# every later report cell is guarded by `check_memory_usage`.
if check_memory_usage:
    import torch
    if torch.cuda.is_available():
        import nbs_pipeline.utils.memory as mem
        gpu_device = torch.cuda.current_device()
        mem.gpu_memory_status(gpu_device)
    else:
        print("CUDA is not available: GPU memory reports are disabled")
        check_memory_usage = False

Used mem: 3141
Used mem: 24576
Used mem: 13%


## Get configuration parameters from yml
> Configuration parameters are obtained from 'config\03-embeddings.yaml'

### Get configuration artifact

In [5]:
# Load the configuration for this notebook through the project's config helper.
# The helper returns a pair: the configuration object and the W&B job type
# (both are passed to `wandb.init` further down).
import utils.config as cfg
config, job_type = cfg.get_artifact_config_embeddings(print_flag = False)

### Show configuration artifact

In [6]:
# Print every configuration entry as "key: value", one entry per line.
for key, value in config.items():
    print(f"{key}: {value}")

use_wandb: True
wandb_group: embeddings
wandb_entity: mi-santamaria
wandb_project: deepvats
enc_artifact: mi-santamaria/deepvats/mvp-SWV:latest
input_ar: None
cpu: False


## Build W&B artifact

In [7]:
import os

# Directory that holds the pipeline notebooks, and this notebook's base name.
path = os.path.expanduser("~/work/nbs_pipeline/")
name = "03b_embeddings-sliding_window_view"

# W&B reads this variable to know which notebook produced the run.
os.environ["WANDB_NOTEBOOK_NAME"] = f"{path}{name}.ipynb"

# The W&B run is named after the notebook.
runname = name
print(f"runname: {runname}")

runname: 03b_embeddings-sliding_window_view


In [8]:
# Start the W&B run. The three settings that depend on `config.use_wandb`
# are grouped: with W&B enabled the run is online in the configured project;
# otherwise it is a disabled, anonymous run in the 'work-nbs' project.
run = wandb.init(
    name=runname,
    entity=config.wandb_entity,
    group=config.wandb_group,
    job_type=job_type,
    config=config,
    resume='allow',
    **(
        dict(project=config.wandb_project, mode='online', anonymous='never')
        if config.use_wandb
        else dict(project='work-nbs', mode='disabled', anonymous='must')
    ),
)

wandb: Currently logged in as: mi-santamaria. Use `wandb login --relogin` to force relogin


'stream.Stream' object attribute 'write' is read-only


## Get trained model artifact

### Build artifact selector
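> Workaround to be able to fetch artifacts when W&B logging is disabled: use the run's `use_artifact` when W&B is enabled, and the public API otherwise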

In [9]:
# Pick the callable used to fetch artifacts for the rest of the notebook:
# through the active run when W&B is enabled, through the public API otherwise.
if config.use_wandb:
    artifacts_gettr = run.use_artifact
else:
    artifacts_gettr = wandb_api.artifact

### Get the model from W&B
> Restore the encoder model and its associated configuration

In [10]:
# Fetch the encoder artifact named in the configuration, requesting type 'learner'.
enc_artifact = artifacts_gettr(config.enc_artifact, type='learner')

In [11]:
# Display the metadata stored with the encoder artifact.
enc_artifact.metadata

{'r': 0.71,
 'w': 365,
 'MVP': {'r': 0.71,
  'lm': 3,
  'crit': None,
  'sync': False,
  'fname': 'encoder_MVP',
  'dropout': 0.1,
  'verbose': False,
  'stateful': True,
  'save_best': True,
  'nan_to_num': 0,
  'custom_mask': None,
  'future_mask': False,
  'weights_path': None,
  'variable_mask': False,
  'subsequence_mask': True},
 'ref': {'hash': '8540811841704731727',
  'type': "<class 'fastai.learner.Learner'>"},
 'alias': 'sunspot',
 'n_inp': 1,
 'device': 'cuda',
 'epochs': 100,
 'frozen': False,
 'mvp_ws': [7, 365],
 'stride': 900,
 'Learner': {'lr': 0.001,
  'wd': None,
  'arch': 'tsai.models.InceptionTimePlus.InceptionTimePlus',
  'moms': [0.95, 0.85, 0.95],
  'path': '.',
  '_name': '<fastai.learner.Learner object at 0x7f8bcf0710f0>',
  'metrics': None,
  'opt_func': 'fastai.optimizer.Adam',
  'splitter': 'tsai.models.utils.ts_splitter',
  'train_bn': True,
  'loss_func': {'axis': -1,
   '_name': {'axis': -1,
    '_name': 'FlattenedLoss of MSELoss()',
    'is_2d': False,
 

In [12]:
# Show the resolved artifact name (the configured ':latest' alias resolves to a concrete version).
print("enc_artifact: "+enc_artifact.name)

enc_artifact: mvp-SWV:v7


In [13]:
# TODO: the first call to `to_obj()` has been observed to fail while an
# immediate second call succeeds; the root cause is still unknown.
# Only `Exception` is caught (not a bare `except:`), so Ctrl-C / kernel
# interrupts are not swallowed, and the first error is reported before the
# single retry instead of being silently discarded.
try:
    enc_learner = enc_artifact.to_obj()
except Exception as first_error:
    print(f"enc_artifact.to_obj() failed ({first_error!r}); retrying once")
    enc_learner = enc_artifact.to_obj()

wandb:   1 of 1 files downloaded.  


## Get dataset artifact from W&B
### Restore the dataset artifact used for training the encoder. 
> Even if we do not compute the dimensionality reduction over this dataset, we need to know the metadata of the encoder training set, to check that it matches with the dataset that we want to reduce.

In [14]:
# Run that logged the encoder artifact; its config is reused below
# (train artifact here, then window length, stride and batch size).
enc_run = enc_artifact.logged_by()
# Dataset artifact referenced by that run's config under 'train_artifact'.
enc_artifact_train = artifacts_gettr(enc_run.config['train_artifact'], type='dataset')
enc_artifact_train.name

'sunspot:v3'

### Specify the dataset artifact that we want to get the embeddings from
> If no input artifact is defined in the configuration, the embeddings are computed for the dataset artifact that was used to train the encoder.

In [15]:
# Fully qualified path of the encoder's training dataset, used as the
# fallback when the configuration does not name an input artifact.
enc_train_ar_path = f'{enc_artifact_train.entity}/{enc_artifact_train.project}/{enc_artifact_train.name}'
input_ar_name = ifnone(config.input_ar, enc_train_ar_path)

# Record the artifact actually used in the run config, then fetch it.
wandb.config.update(dict(input_ar=input_ar_name), allow_val_change=True)
input_ar = artifacts_gettr(input_ar_name)
input_ar.name

'sunspot:v3'

In [16]:
# Load the input artifact into `df` and preview its first rows.
df = input_ar.to_df()
df.head()

wandb:   1 of 1 files downloaded.  


Unnamed: 0,0
1818-01-08,65.0
1818-01-09,
1818-01-10,
1818-01-11,
1818-01-12,


In [17]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(73924, 1)

### Split data with Sliding Window

In [18]:
import time

In [19]:
# Window length, read from the encoder run's config; it is passed as
# `fcst_history` when the sliding-window input is built.
w = enc_run.config['w']
print(w)

365


In [20]:
# GPU memory snapshot before the sliding-window input is built.
if check_memory_usage:
    mem.gpu_memory_status(gpu_device)

Used mem: 3141
Used mem: 24576
Used mem: 13%


In [21]:
# Build the encoder input from `df` with a history window of `w` steps, and
# time the call with wall-clock timestamps.
t_start = time.time()
# The second value returned by `prepare_forecasting_data` is discarded.
enc_input, _ = prepare_forecasting_data(df, fcst_history = w)
t_end = time.time()
t = t_end - t_start
print("SW start | " , t_start, " | end ", t_end, "total (secs): ", t)
print(enc_input.shape)

SW start |  1701190449.5197039  | end  1701190449.520824 total (secs):  0.0011200904846191406
(73559, 1, 365)


In [22]:
# GPU memory snapshot after the sliding-window input has been built.
if check_memory_usage:
    mem.gpu_memory_status(gpu_device)

Used mem: 3141
Used mem: 24576
Used mem: 13%


### Get embeddings

In [23]:
# Reuse the stride and batch size recorded in the encoder run's config.
stride = enc_run.config['stride']
batch_size = enc_run.config['batch_size']
# Disabled alternative: set the batch size on the learner's DataLoaders.
# The batch size is instead passed explicitly to the embedding function.
#enc_learner.dls.bs = enc_run.config['batch_size']

In [24]:
# Show the stride and batch size that will be used to compute the embeddings.
print(stride)
print(batch_size)

900
512


In [25]:
# Recap before computing the embeddings: input shape and encoder artifact in use.
print(enc_input.shape)
print(enc_artifact.name)

(73559, 1, 365)
mvp-SWV:v7


In [26]:
# GPU memory snapshot right before the embeddings are computed.
if check_memory_usage:
    mem.gpu_memory_status(gpu_device)

Used mem: 3141
Used mem: 24576
Used mem: 13%


In [27]:
# Cross-check of the GPU memory figures using `nvidia-smi` directly
# (IPython `!` shell escapes; requires `nvidia-smi` on the PATH).
# Printed labels are in Spanish: "Memoria usada" = used memory,
# "Memoria total" = total memory, "porcentaje" = percentage.
print ("Memoria usada")
# Used memory as reported on the first line of the nvidia-smi output.
! nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1
print("Memoria total")
# Total memory, again taken from the first output line.
! nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | awk 'NR==1{print $1}'
print("porcentaje")
# 100 * used / total with shell integer arithmetic, so the result is truncated.
! echo $(( 100 * $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) / $(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1) ))

Memoria usada
3141
Memoria total
24576
porcentaje
12


In [28]:
# Compute the embeddings of `enc_input` with the restored encoder learner,
# using the stride and batch size of the encoder run, and time the call.
t_start = time.time()
embs = get_enc_embs_set_stride_set_batch_size(
    X = enc_input, 
    enc_learn = enc_learner, 
    stride = stride, 
    batch_size = batch_size, 
    cpu=config.cpu, 
    to_numpy=True, 
    print_flag = False,
    # With `time_flag` on, the helper prints its own elapsed time as well.
    time_flag = True
)
t_end = time.time()
t = t_end - t_start
print("GE start | " , t_start, " | end ", t_end, "total (secs): ", t)

get_enc_embs_set_stride_set_batch_size 1.8747472763061523 seconds
GE start |  1701190451.475051  | end  1701190453.3505607 total (secs):  1.8755097389221191


In [29]:
# GPU memory snapshot after the embeddings have been computed.
if check_memory_usage:
    mem.gpu_memory_status(gpu_device)

Used mem: 3499
Used mem: 24576
Used mem: 14%


In [30]:
# Log the embeddings as a W&B artifact named 'embeddings-SWV', carrying the
# run configuration as metadata. Skipped entirely when W&B is disabled.
if config.use_wandb:
    embs_artifact = ReferenceArtifact(embs, 'embeddings-SWV', metadata=dict(run.config))
    # `aliases` is documented as a list of strings, so the single alias is
    # wrapped in a list rather than passed as a bare string.
    run.log_artifact(embs_artifact, aliases=[f'run-{run.project}-{run.id}'])

In [31]:
# Close the W&B run started earlier in the notebook.
run.finish()

VBox(children=(Label(value='0.006 MB of 0.006 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

# Free GPU

In [32]:
# Final GPU memory snapshot, taken before the kernel process is terminated.
if check_memory_usage:
    mem.gpu_memory_status(gpu_device)

Used mem: 3499
Used mem: 24576
Used mem: 14%


In [None]:
# Terminate the kernel process immediately with exit status 0, skipping normal
# interpreter cleanup. Used here (section "Free GPU") so that the process stops
# holding GPU memory. Anything still in the kernel is lost.
os._exit(00)