In [None]:
#| default_exp utils

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

# Utils

> Utilities used in the rest of the notebooks

In [None]:
#| export
from dvats.imports import *
from fastcore.all import *
import wandb
import pickle
import pandas as pd
import numpy as np
#import tensorflow as tf
import torch.nn as nn
from fastai.basics import *

### Generate random time series dataframe

In [None]:
#| export
def generate_TS_df(rows, cols):
    """Generate a dataframe containing a random multivariate time series.

    Each column represents a variable and each row a time point (sample). The
    timestamps live in the index of the dataframe; they start at the current
    time and are evenly spaced, 1 second apart. Values are drawn with
    `np.random.randn`.

    Args:
        rows: number of time points (rows) to generate.
        cols: number of variables (columns) to generate.

    Returns:
        A `pd.DataFrame` of shape `(rows, cols)` indexed by timestamp.
    """
    # Ask for an explicit number of periods. Building the index with
    # `np.arange(now(), now() + (rows-1)s, 1s)` only produced `rows` samples when
    # the second `now()` call happened to be strictly later than the first one.
    index = pd.date_range(start=pd.Timestamp.now(), periods=rows,
                          freq=pd.Timedelta(1, 'seconds'))
    data = np.random.randn(len(index), cols)
    return pd.DataFrame(data, index=index)

In [None]:
df = generate_TS_df(3, 5)

In [None]:
test_eq(df.shape, (3, 5))

## pandas DataFrame utilities

### Normalize columns

In [None]:
#| export
def normalize_columns(df:pd.DataFrame):
    "Standardize each column of `df`: subtract the column mean and divide by the column standard deviation"
    col_mean = df.mean()
    # The small constant keeps the division finite for zero-variance columns
    col_scale = df.std() + 1e-7
    return df.sub(col_mean).div(col_scale)

In [None]:
foo = generate_TS_df(3, 3)
foo.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-0.356446,-0.217669,-0.451268
std,1.567722,0.268632,0.309269
min,-1.327362,-0.48047,-0.693929
25%,-1.260753,-0.354722,-0.625384
50%,-1.194144,-0.228973,-0.55684
75%,0.129012,-0.086268,-0.329938
max,1.452169,0.056437,-0.103037


In [None]:
bar = normalize_columns(foo)
bar.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,-3.700743e-17,-3.700743e-17,-9.251859e-17
std,0.9999999,0.9999996,0.9999997
min,-0.6193166,-0.9782957,-0.7846253
25%,-0.5768289,-0.5101875,-0.5629913
50%,-0.5343411,-0.04207935,-0.3413573
75%,0.3096583,0.4891479,0.3923127
max,1.153658,1.020375,1.125983


In [None]:
test_close(bar.describe().loc['mean'].values, np.repeat(0.0, len(bar.columns)))

In [None]:
test_close(bar.describe().loc['std'].values, np.repeat(1.0, len(bar.columns)))

### Remove constant columns

In [None]:
#| export
def remove_constant_columns(df:pd.DataFrame):
    """Drop the columns of `df` that hold the same value in every row.

    A column is kept when at least one of its values differs from the value in
    its first row. NaN compares unequal to everything (itself included), so a
    column containing NaN is always kept.

    Args:
        df: dataframe to filter.

    Returns:
        A dataframe with only the non-constant columns. A dataframe with no rows
        is returned as an unmodified copy, since there is nothing to compare.
    """
    # `df.iloc[0]` raises IndexError when there are no rows
    if len(df) == 0:
        return df.copy()
    varies = (df != df.iloc[0]).any()
    return df.loc[:, varies]

In [None]:
foo = generate_TS_df(3, 3)
foo['constant'] = [0.0]*len(foo)
foo

Unnamed: 0,0,1,2,constant
2022-04-23 10:32:47.134188,-0.658749,1.680371,1.426223,0.0
2022-04-23 10:32:48.134188,1.961724,1.247468,-1.358013,0.0
2022-04-23 10:32:49.134188,-0.195324,0.413195,0.504546,0.0


In [None]:
bar = remove_constant_columns(foo)
bar

Unnamed: 0,0,1,2
2022-04-23 10:32:47.134188,-0.658749,1.680371,1.426223
2022-04-23 10:32:48.134188,1.961724,1.247468,-1.358013
2022-04-23 10:32:49.134188,-0.195324,0.413195,0.504546


In [None]:
column_diff = set(foo.columns) - set(bar.columns)
test_eq_type(column_diff, set(['constant']))

## Create a wandb artifact containing just a reference to an object passed as an argument

In [None]:
#| export
class ReferenceArtifact(wandb.Artifact):
    """Artifact with a single reference to an object passed as argument in the constructor.

    The object is pickled, the pickled bytes are hashed, and the bytes are stored
    in `folder` in a file named after the hash. The artifact references that file
    and records the hash and the class of the object under `metadata['ref']`.

    Args:
        obj: object to store; it must be picklable.
        name: name of the artifact.
        type: type of the artifact.
        folder: directory where the pickled object is written. Defaults to
            `Path.home()/default_storage_path`. It is created if missing.
        **kwargs: forwarded to `wandb.Artifact.__init__`.
    """
    default_storage_path = Path('data/wandb_artifacts/') # * this path is relative to Path.home()

    @delegates(wandb.Artifact.__init__)
    def __init__(self, obj, name, type='object', folder=None, **kwargs):
        import hashlib  # local import so this block does not depend on the import cell
        super().__init__(type=type, name=name, **kwargs)
        # Serialize once, so the hash always describes exactly the bytes written to disk
        payload = pickle.dumps(obj)
        # Use a content digest rather than the builtin `hash`: hashing of bytes is
        # randomized per interpreter process, which gave the same object a
        # different file name on every run.
        hash_code = hashlib.sha256(payload).hexdigest()
        folder = Path(ifnone(folder, Path.home()/self.default_storage_path))
        # The default storage folder may not exist yet on a fresh machine
        folder.mkdir(parents=True, exist_ok=True)
        with open(f'{folder}/{hash_code}', 'wb') as f:
            f.write(payload)
        self.add_reference(f'file://{folder}/{hash_code}')
        if self.metadata is None:
            self.metadata = dict()
        self.metadata['ref'] = dict()
        self.metadata['ref']['hash'] = hash_code
        self.metadata['ref']['type'] = str(obj.__class__)

In [None]:
# Store the pickled object in the current directory instead of the default folder
foo = np.arange(10)
bar = ReferenceArtifact(obj=foo, name='foo', folder='.')
# The pickled file is named after the hash recorded in the artifact metadata
bar_path = Path(f'./{bar.metadata["ref"]["hash"]}')
test_eq(bar_path.exists(), True)
test_eq(bar.metadata['ref']['type'], "<class 'numpy.ndarray'>")
# Clean up so that re-running the notebook does not leave pickled files behind
bar_path.unlink()

When a reference artifact is used by a wandb run, we need a method to recover the original object from it.

In [None]:
#| export
@patch
def to_obj(self:wandb.apis.public.Artifact):
    """Download the files of a saved ReferenceArtifact and get the referenced object.

    The artifact must come from a call to `run.use_artifact` with a proper wandb run.
    If the pickled file is still available in the local storage folder it is read
    from there; otherwise the artifact is downloaded and its first file is used.

    Returns:
        The unpickled object, or `None` (after printing an error message) when
        the artifact metadata has no `ref` entry.
    """
    ref = self.metadata.get('ref')
    if ref is None:
        print(f'ERROR:{self} does not come from a saved ReferenceArtifact')
        return None
    relative_path = ReferenceArtifact.default_storage_path/ref['hash']
    # `default_storage_path` is relative to the home directory, which is where
    # ReferenceArtifact writes by default. The bare relative path (resolved
    # against the working directory) is still checked for backward compatibility.
    candidates = (Path.home()/relative_path, relative_path)
    local_path = next((p for p in candidates if p.exists()), None)
    path = local_path if local_path is not None else Path(self.download()).ls()[0]
    # NOTE(review): unpickling can execute arbitrary code; only use this with
    # artifacts from trusted projects.
    with open(path, 'rb') as f:
        obj = pickle.load(f)
    return obj

Test with Reference artifact from a df

In [None]:
foo = generate_TS_df(3, 3)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:///home/dmontalvo/data/wandb_artifacts/776758047213944476/776758047213944476>])

In [None]:
test_eq(bar.name, 'test_reference_artifact')

In [None]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

TODO: Test method `to_obj`

ReferenceArtifact with a numpy array

In [None]:
foo = np.random.randn(5)
bar = ReferenceArtifact(obj=foo, name='test_reference_artifact')
bar.manifest.entries.values()

dict_values([<ManifestEntry ref: file:///home/dmontalvo/data/wandb_artifacts/-8161467246553937997/-8161467246553937997>])

In [None]:
test_eq(bar.metadata['ref']['type'], str(type(foo)))

In [None]:
#| export
import torch.nn as nn
class PrintLayer(nn.Module):
    "Pass-through layer that prints the shape of its input; insert it in a model to debug shapes"
    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Any extra print / debug statements go here; the input is returned untouched
        print(x.shape)
        return x

In [None]:
#| export
@patch
def export_and_get(self:Learner, keep_exported_file=False):
    """
        Export the learner into an auxiliary file, load it and return it back.
        Input:
        - `keep_exported_file`: whether to keep the auxiliary file (`aux.pkl`,
          inside the learner's `path`) instead of deleting it after loading

        Output: the learner loaded from the exported file
    """
    # `Learner.export` writes to `self.path/fname`, so load and delete that same
    # file instead of assuming it is in the current working directory.
    aux_path = Path(self.path)/'aux.pkl'
    self.export(fname='aux.pkl')
    try:
        aux_learn = load_learner(aux_path)
    finally:
        # Remove the auxiliary file even when loading fails
        if not keep_exported_file: aux_path.unlink()
    return aux_learn

### get_wandb_artifacts

In [None]:
#| export
def get_wandb_artifacts(project_path, type=None, name=None, last_version=True):
    """
        Get the artifacts logged in a wandb project.
        Input:
        - `project_path` (str): entity/project_name
        - `type` (str): artifact type to restrict the search to. Leave None to
          look into every artifact type of the project
        - `name` (str): artifact (collection) name to keep. Leave None to have
          all artifact names
        - `last_version`: whether to return only one version of each artifact
          (the first one yielded by `artifact_versions`, presumably the most
          recent one -- TODO confirm ordering) or all of them

        Output: List of artifacts
    """
    public_api = wandb.Api()
    if type is not None:
        types = [public_api.artifact_type(type, project_path)]
    else:
        types = public_api.artifact_types(project_path)

    res = L()
    for kind in types:
        for collection in kind.collections():
            if name is not None and name != collection.name:
                continue
            versions = public_api.artifact_versions(
                kind.type,
                "/".join([kind.entity, kind.project, collection.name]),
                per_page=1,
            )
            if last_version:
                # A collection without versions would make a bare `next` raise
                # StopIteration out of this function; skip it instead.
                latest = next(versions, None)
                if latest is not None: res += latest
            else:
                res += L(versions)
    return list(res)

In [None]:
# These checks query the `wandb/artifacts-example` project through `wandb.Api()`
# (presumably needs network access to wandb -- confirm before running offline).
# The expected counts are tied to the contents of that project.
foo = get_wandb_artifacts('wandb/artifacts-example', type='model')
test_eq(len(foo), 2)
# Filtering by name keeps a single artifact when only one version per artifact is requested
foo = get_wandb_artifacts('wandb/artifacts-example', type='model', name='convnet')
test_eq(len(foo), 1)
# With `last_version=False` every version of the artifact is returned
foo = get_wandb_artifacts('wandb/artifacts-example', type='model', name='convnet', last_version=False)
test_eq(len(foo), 2)

### get_pickle_artifact

In [None]:
#| export
def get_pickle_artifact(filename):
    "Open `filename` in binary mode and return the object unpickled from it"
    with open(filename, "rb") as handle:
        return pickle.load(handle)

## Pickled
Functions that run other functions while loading/saving their input/output as pickled objects.

In [None]:
#| export
def pickled_file(function, filename=None, hash_code = None, include_hash_code = True):
    """Build the name of the pickle file associated with `function`.

    Args:
        function: function the file belongs to; its `__name__` is used for the
            default file name.
        filename: base name of the file, without extension. When it is None or
            an empty string, `result_<function name>` is used.
        hash_code: optional code appended to the base name as `_<hash_code>`.
        include_hash_code: whether to append `hash_code` to an explicit
            `filename`. The code is always appended to the default name.

    Returns:
        The file name, ending in `.pickle`.
    """
    # Treat an empty string like None: it would otherwise produce names such as
    # ".pickle" or "_<hash>.pickle", which never match the default name that is
    # generated when no file name is given.
    use_default_name = not filename
    if use_default_name:
        filename = f"result_{function.__name__}"
    if hash_code is not None and (include_hash_code or use_default_name):
        filename = f"{filename}_{hash_code}"
    return f"{filename}.pickle"

In [None]:
#| hide
# Walk through the combinations of `filename` / `hash_code` / `include_hash_code`
# accepted by `pickled_file`, printing the inputs followed by the generated name.
filename = None
hash_code = None
# Toy function reused by the cells below: prints the operation and returns (a+b)*c
def foo(a,b,c=1): 
    print(f"({a}+{b})*{c}")
    return (a+b)*c
    
# No file name and no hash code
print(f"{filename} | {hash_code}")
print(pickled_file(foo, filename, hash_code))

# No file name, with a hash code
hash_code = 1
print(f"{filename} | {hash_code}")
print(pickled_file(foo,filename, hash_code))

# Explicit file name, with a hash code
filename = "Potato"
print(f"{filename} | {hash_code}")
print(pickled_file(foo, filename, hash_code))

# Explicit file name, hash code given but `include_hash_code=False`
print(f"{filename} | {hash_code}")
print(pickled_file(foo, filename, hash_code, False))

# No file name, hash code given and `include_hash_code=False`
filename = None
print(f"{filename} | {hash_code}")
print(pickled_file(foo, filename, hash_code, False))

None | None
result_foo.pickle
None | 1
result_foo_1.pickle
Potato | 1
Potato_1.pickle
Potato | 1
Potato.pickle
None | 1
result_foo_1.pickle


In [None]:
def exec_and_pickle(function, filename=None, *args, **kwargs):
    """Call `function(*args, **kwargs)` and store its result as a pickle file.

    Args:
        function: callable to execute.
        filename: base name passed to `pickled_file` to build the file name.
        *args, **kwargs: arguments forwarded to `function`.

    Returns:
        A tuple `(filename, hash_code)`: the name of the file written and the
        hash of the pickled result (as a string) that is part of that name.
    """
    result = function(*args, **kwargs)
    # Serialize once and reuse the bytes for both the hash and the file, so the
    # result is not pickled twice and the hash describes exactly what is stored.
    payload = pickle.dumps(result)
    # NOTE: the builtin `hash` of bytes is randomized per interpreter process
    # (unless PYTHONHASHSEED is fixed), so this code is not reproducible across
    # sessions; keep the returned `hash_code` to locate the file later.
    hash_code = str(hash(payload))
    filename = pickled_file(function, filename, hash_code)
    with open(filename, 'wb') as f:
        f.write(payload)
    return filename, hash_code

In [None]:
print(foo(1,2))
filename, hash_code = exec_and_pickle(foo, None, 1, 2)
print(filename)
print(hash_code)

(1+2)*1
3
(1+2)*1
result_foo_6814786563595406894.pickle
6814786563595406894


In [None]:
def exec_with_pickle(function, filename = "", hash_code = None, include_hash_code = True, print_flag=False, *args, **kwargs): 
    """Load a pickled object and call `function` with it as first argument.

    The file name is built by `pickled_file` from `function`, `filename`,
    `hash_code` and `include_hash_code`, and it is always printed. When
    `print_flag` is set, the loaded object and the extra arguments are printed
    too. Returns `function(<loaded object>, *args, **kwargs)`.
    """
    pickle_path = pickled_file(function, filename, hash_code, include_hash_code)
    print(pickle_path)
    with open(pickle_path, 'rb') as f:
        loaded = pickle.load(f)
    if print_flag:
        print( f"input: {loaded}, args: {args}, kwargs: {kwargs}")
    return function(loaded, *args, **kwargs)

In [None]:
exec_with_pickle(foo, None, hash_code, True, True, 4, 2)

result_foo_6814786563595406894.pickle
input: 3, args: (4, 2), kwargs: {}
(3+4)*2


14

In [None]:
import os
def remove_pickled(function, filename = None, hash_code = None, include_hash_code = True, print_flag = False):
    """Delete the pickle file that `pickled_file` names for the given arguments.

    Nothing is raised on failure: a missing file or any other error is reported
    with a printed message. A successful deletion is reported only when
    `print_flag` is set.
    """
    target = pickled_file(function, filename, hash_code, include_hash_code)
    try:
        os.remove(target)
        if print_flag:
            print(f"Filename {target} deleted")
    except FileNotFoundError:
        print(f"File {target} not found")
    except Exception as err:
        print(f"Error deleting {target}: {err}")

In [None]:
remove_pickled(foo, None, hash_code, True, True)

Filename result_foo_6814786563595406894.pickle deleted


## Export

In [None]:
#| hide
#from nbdev.export import notebook2script
#notebook2script()
beep(1)

Converted dr.ipynb.
Converted encoder.ipynb.
Converted index.ipynb.
Converted load.ipynb.
Converted utils.ipynb.
Converted visualization.ipynb.
