In [1]:
#| export
# Notebook input parameters. A value of None means "not provided"; the
# defaults cell further down replaces every None with its fallback value.

# Debugging / execution control
verbose = None
show_plots = None
reset_kernel = None
cuda_device = None
check_parameters = True

# Pre-configured case selection
pre_configured_case = False
case_id = None
frequency_factor = 20
frequency_factor_change_alias = True

## Checking input parameters

In [2]:
#| export
if check_parameters:
    print("--- Check parameters ---")
    # (label, value) pairs, echoed as one space-separated line.
    parameter_report = (
        ("verbose:", verbose),
        ("show_plots:", show_plots),
        ("reset_kernel:", reset_kernel),
        ("pre_configured_case:", pre_configured_case),
        ("case_id:", case_id),
        ("frequency_factor:", frequency_factor),
        ("frequency_factor_change_alias:", frequency_factor_change_alias),
        ("cuda_device:", cuda_device),
    )
    print(*(token for pair in parameter_report for token in pair))

--- Check parameters ---
verbose: None show_plots: None reset_kernel: None pre_configured_case: False case_id: None frequency_factor: 20 frequency_factor_change_alias: True cuda_device: None


## Set default input parameter values ensuring no errors
### Values explained below in their natural execution place

In [3]:
#| export
# Only parameters that are still None receive a default; any value that was
# explicitly provided (including False or 0) is left untouched.
if verbose is None:
    verbose = 1
if show_plots is None:
    show_plots = False
if reset_kernel is None:
    reset_kernel = False
if pre_configured_case is None:
    pre_configured_case = True
if case_id is None:
    case_id = 6
if frequency_factor is None:
    frequency_factor = 1
if frequency_factor_change_alias is None:
    frequency_factor_change_alias = True
if cuda_device is None:
    cuda_device = 0

# Create artifact from time series dataframe
Reads a `.tsf`, `.csv` or pickle file containing a time series, converts it into a pandas DataFrame and uploads it to Weights & Biases (W&B) as an artifact.

## Set-up
Initial notebook setup and specific debugging and pre-configured cases selection
### VsCode update patch
Initial notebook setup when using VSCode

In [4]:
#| export
import sys
import dvats.utils as ut
# When the process was launched with the `--vscode` flag, replace
# `ut.DisplayHandle.update` with the project's `ut.update_patch`.
if sys.argv.count('--vscode') > 0:
    print("Executing inside vscode")
    ut.DisplayHandle.update = ut.update_patch

### Debugging variables
- `verbose`. If `> 0`, debugging messages are printed by the functions that support it (e.g. `get_enc_embeddings`).
- `reset_kernel`. If `True`, the kernel is reset at the end of the execution. Use it only when memory management is needed.
- `show_plots`. If `True` all plots are shown within the execution of the notebook. Otherwise, none of them will be plotted.

## Preconfigured case selection
- `pre_configured_case`. If `True`, a preconfigured case will be selected, forcing the artifact to get the expected configuration based on the information in `config\*.yml` and `utils\config.py`.
- `case_id`. If `pre_configured_case` is `True`, the configuration of the preconfigured sample with this id is selected. The available preconfigured samples are listed below.
- `frequency_factor`. If `pre_configured_case` is `True`, the frequency will be resampled to `config.freq*frequency_factor`.
- `frequency_factor_change_alias`. If `pre_configured_case` is `True` and `frequency_factor != 1`, the dataset alias is modified by adding the new frequency as a suffix.

In [5]:
#| export
import dvats.config as cfg_

In [6]:
#| hide
# Print the list of available pre-configured datasets. The number printed next
# to each name is presumably the value expected by `case_id` — TODO confirm.
cfg_.show_available_configs()

Available datasets: 
0 - monash_australian_electricity_demand_0
1 - monash_solar_4_seconds_0
2 - wikipedia_0
3 - traffic_san_francisco_0
4 - monash_solar_10_minutes_0
5 - etth1_0
6 - stumpy_abp_0
7 - stumpy_toy_0


In [7]:
#| export 
##### ----- This cell should be substituted by input parameters ------ #####
##### See _ploomber_engine_example_.ipynb
##### Uncomment for direct Notebook execution
#pre_configured_case = False
#case_id = None
#frequency_factor = 1
#frequency_factor_change_alias = True

## Main code


In [8]:
#| export
import pandas as pd
import numpy as np
from fastcore.all import *
import wandb
from dvats.load import TSArtifact, infer_or_inject_freq
import pickle
import matplotlib
import matplotlib.pyplot as plt
from tsai.data.external import convert_tsf_to_dataframe
from tsai.utils import stack_pad

### Path and Artifact configuration
This notebook gets configuration from `config\base.yaml` and `config\01-dataset_artifact.yaml`

In [9]:
#| export
# Home directory of the current user.
# NOTE(review): not referenced again in this notebook (later cells call
# `Path.home()` directly); it may be used by importers of the exported module — confirm.
base_path = Path.home()

In [10]:
#| export
# Load the artifact configuration and, when a pre-configured case is requested,
# let the project helper force that case onto it. The helper's return value is
# not used, so it presumably updates `config` in place — TODO confirm.
config = cfg_.get_artifact_config_sd2a(verbose = 0)
if pre_configured_case:
    print(f"Pre configured case id: {case_id}")
    forced_case_options = {
        'config': config,
        'id': case_id,
        'verbose': verbose,
        'both': verbose > 0,
        'frequency_factor': frequency_factor,
        'frequency_factor_change_alias': frequency_factor_change_alias,
    }
    cfg_.force_artifact_config_sd2a(**forced_case_options)
cfg_.show_attrdict(config)

artifact_name: S1
csv_config: {}
data_cols: [13]
data_fpath: ~/data/synthetic_data/synthetic_segmentation(I).csv
date_format: %Y-%m-%d %H:%M:%S
date_offset: None
freq: 1m
joining_train_test: False
missing_values_technique: None
missing_values_constant: None
normalize_training: False
range_training: None
range_testing: None
resampling_freq: 20T
start_date: None
test_split: None
time_col: None
use_wandb: True
wandb_artifacts_path: ./data/wandb_artifacts


### Data Extraction

The data is assumed to come as a dataframe, either as a binary pickle file or
as a csv file. It can also come as a `.tsf` file.

#### Check file content (if wanted)

In [11]:
#| hide
if verbose > 0:
    fpath = os.path.expanduser(config.data_fpath)
    print(fpath)
    try:
        # Preview: echo the first 13 lines of the raw file as they are.
        with open(fpath, 'r') as file:
            for _ in range(13):
                print(file.readline(), end='')
        # Then try to parse the file as TSF and report its start timestamp(s).
        # Any failure (including "this is not a TSF file") is only reported.
        data, _, _, _, _ = convert_tsf_to_dataframe(fpath)
        print("Timestamp", data.start_timestamp)
    except Exception as e:
        print("Error while converting file. Maybe not a tsf: ", e)

/home/macu/data/synthetic_data/synthetic_segmentation(I).csv
Timestamp,1 hours component,2 hours component,3 hours component,4 hours component,6 hours component,8 hours component,12 hours component,Hourly component,Daily component,Weekly component,Noise,Seasonal time series,Seasonal time series w/ noise
2022-03-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.468216872228519,1.0,0.5317831277714811
2022-03-01 00:01:00,0.0,0.0314015737457663,0.0,0.039265422461809724,0.0,0.0,0.020071031646260045,0.09073802785383607,0.010035611354917113,0.0015583295883677347,-3.1711687339266006,1.102331968797121,-2.06883676512948
2022-03-01 00:02:00,0.0,0.06271707796059207,0.0,0.07850393436441575,0.0,0.0,0.04014053480575207,0.18136154713075991,0.02007103164626005,0.003116658571258016,1.4151376954645865,1.204549237348278,2.6196869328128645
2022-03-01 00:03:00,0.0,0.09386067902413851,0.0,0.11768864359176742,0.0,0.0,0.06020698110810823,0.27175630372401416,0.030106069814092213,0.004674986343193628,-3.190

#### Extract data

In [12]:
#| export
# Load the raw dataset into `df`, choosing the reader from the file extension.
# The extension is lower-cased so that e.g. `data.CSV` or `data.TSF` is accepted.
ext = str(config.data_fpath).split('.')[-1].lower()

if ext == 'pickle':
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    df = pd.read_pickle(config.data_fpath)

elif ext in ['csv','txt']:
    # Extra reader options come straight from the configuration.
    df = pd.read_csv(config.data_fpath, **config.csv_config)

elif ext == 'tsf':
    data, _, _, _, _ = convert_tsf_to_dataframe(os.path.expanduser(config.data_fpath))
    # The first start timestamp found in the file becomes the configured start date.
    config.update({'start_date': data.start_timestamp[0]}, allow_val_change=True)
    date_format = config.date_format
    # Stack the series into one array and transpose it into the dataframe.
    # presumably one column per series — TODO confirm against tsai's stack_pad.
    df = pd.DataFrame(stack_pad(data.series_value).T)

else:
    raise Exception('The data file path has an unsupported extension')


In [13]:
#| hide
# Report the successful load: confirmation message, shape and first rows.
if verbose > 0:
    print('File loaded successfully', df.shape, sep='\n')
    display(df.head())

File loaded successfully
(40320, 14)


Unnamed: 0,Timestamp,1 hours component,2 hours component,3 hours component,4 hours component,6 hours component,8 hours component,12 hours component,Hourly component,Daily component,Weekly component,Noise,Seasonal time series,Seasonal time series w/ noise
0,2022-03-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.468217,1.0,0.531783
1,2022-03-01 00:01:00,0.0,0.031402,0.0,0.039265,0.0,0.0,0.020071,0.090738,0.010036,0.001558,-3.171169,1.102332,-2.068837
2,2022-03-01 00:02:00,0.0,0.062717,0.0,0.078504,0.0,0.0,0.040141,0.181362,0.020071,0.003117,1.415138,1.204549,2.619687
3,2022-03-01 00:03:00,0.0,0.093861,0.0,0.117689,0.0,0.0,0.060207,0.271756,0.030106,0.004675,-3.190646,1.306537,-1.884108
4,2022-03-01 00:04:00,0.0,0.124747,0.0,0.156793,0.0,0.0,0.080269,0.361809,0.040141,0.006233,-1.053562,1.408182,0.35462


#### Set the time column (if any) as index

In [14]:
#| export
# Use the configured time column(s), if any, as the dataframe's DatetimeIndex
# and remove them from the data columns. `config.time_col` is positional:
# either a single int or a list of ints.
if config.time_col is not None:
    if verbose > 0: print("time_col: "+str(config.time_col))

    if isinstance(config.time_col, int):
        # A single column already holds the timestamps.
        if verbose > 0: print("Op 1: time_col int")
        datetime = df.iloc[:, config.time_col]

    elif isinstance(config.time_col, list):
        # Several columns: join their string forms with '-' row by row.
        if verbose > 0: print("Op 2: time_col list")
        datetime = df.iloc[:, config.time_col].apply(lambda x: x.astype(str).str.cat(sep='-'), axis=1)

    else:
        # Fail loudly: otherwise `datetime` would be undefined (NameError) or,
        # worse, silently reused from a previous execution of this cell.
        raise TypeError(
            "config.time_col must be None, an int or a list of ints, got "
            + type(config.time_col).__name__
        )

    index = pd.DatetimeIndex(datetime)

    if config.date_offset:
        index += config.date_offset

    df = df.set_index(index, drop=False)

    # Delete the timestamp column(s) now that they live in the index.
    col_name = df.columns[config.time_col]

    if verbose > 0: print("... drop Timestamp col " + str(col_name))

    df = df.drop(col_name, axis=1)

if verbose > 0: display(df.head())

Unnamed: 0,Timestamp,1 hours component,2 hours component,3 hours component,4 hours component,6 hours component,8 hours component,12 hours component,Hourly component,Daily component,Weekly component,Noise,Seasonal time series,Seasonal time series w/ noise
0,2022-03-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.468217,1.0,0.531783
1,2022-03-01 00:01:00,0.0,0.031402,0.0,0.039265,0.0,0.0,0.020071,0.090738,0.010036,0.001558,-3.171169,1.102332,-2.068837
2,2022-03-01 00:02:00,0.0,0.062717,0.0,0.078504,0.0,0.0,0.040141,0.181362,0.020071,0.003117,1.415138,1.204549,2.619687
3,2022-03-01 00:03:00,0.0,0.093861,0.0,0.117689,0.0,0.0,0.060207,0.271756,0.030106,0.004675,-3.190646,1.306537,-1.884108
4,2022-03-01 00:04:00,0.0,0.124747,0.0,0.156793,0.0,0.0,0.080269,0.361809,0.040141,0.006233,-1.053562,1.408182,0.35462


#### Set dataframe frequency

In [15]:
#| export
# Hand the dataframe to the project helper together with the configured
# frequency, start date and date format. Judging by its name it infers the
# index frequency or injects the configured one — see dvats.load to confirm.
freq_settings = {
    'injected_freq': config.freq,
    'start_date': config.start_date,
    'format': config.date_format,
}
df = infer_or_inject_freq(df, **freq_settings)
if verbose > 0:
    print(df.index.freq)

<Minute>


#### Select only the needed variables

In [16]:
#| export
# Keep only the configured columns (positional indices). An empty or None
# `data_cols` keeps every column.
if config.data_cols:
    if verbose > 0:
        print("data_cols: ", config.data_cols)
    df = df.iloc[:, config.data_cols]

if verbose > 0:
    print(f'Num. variables: {df.shape[1]}')

data_cols:  [13]
Num. variables: 1


#### Ensure data integrity

In [17]:
#| export
# Duplicated rows
# The previous `df.drop_duplicates()` call discarded its result, so nothing was
# ever dropped. Simply assigning it would not be safe either: it compares the
# values only, so measurements that legitimately repeat at different timestamps
# would be deleted. A row is therefore treated as a duplicate only when BOTH
# its index label and all of its values repeat an earlier row.
if verbose > 0: print("df shape before dropping duplicates", df.shape)
exact_duplicates = df.rename_axis("__index__").reset_index().duplicated().to_numpy()
if exact_duplicates.any():
    # Reassign only when something must be removed, so that an already clean
    # dataframe (and its index) is left exactly as it was.
    df = df[~exact_duplicates]
if verbose > 0: print("df shape after dropping duplicates", df.shape)
# Check whether the dataframe index still has duplicated labels
# (same label carrying different values), which cannot be resolved automatically.
if df.index.duplicated().any():
    raise ValueError("Duplicated index names")

df shape before dropping duplicates (40320, 1)
df shape after dropping duplicates (40320, 1)


In [18]:
#| export
# Replace the configured missing-value sentinel by np.nan.
# The test is `is not None` rather than truthiness, so that falsy sentinels
# such as 0 or '' are honoured. `df` is rebound instead of mutated in place,
# because it may be a column selection of another frame.
if config.missing_values_constant is not None:
    df = df.replace(config.missing_values_constant, np.nan)

#### Show time series plot

In [19]:
#| hide
if show_plots:
    # Show time series plot: every column of `df` on one axis, coloured with
    # the 'viridis' colormap.
    fig, ax = plt.subplots(1, figsize=(15,5))
    cmap = matplotlib.colormaps.get_cmap('viridis')
    df.plot(colormap=cmap, ax=ax)
    plt.tight_layout()
    plt.legend()
    # `plt.show()` returns None, so wrapping it in `display()` only added a
    # stray "None" to the cell output.
    plt.show()

### Data Transformation

__Handle Missing Values, Resample and Normalize__

> In this second part, a Time Series Artifact (TSArtifact) object can be created, and missing-value handling, resampling and normalization can be applied.
> 
> These techniques should be applied to the three subsets, which must be created beforehand: training, validation and testing.

#### Training data

##### Build dataframe

In [20]:
#| export
# Decide which index labels belong to the training subset. Precedence:
#   1. explicit list of labels,
#   2. dict with 'start' / 'end' / 'freq' expanded through pd.date_range,
#   3. the leading (1 - test_split) fraction of the rows,
#   4. nothing configured: the whole dataframe is the training set.
rg = config.range_training

rg_training = None
if isinstance(rg, list):
    rg_training = rg
elif isinstance(rg, dict):
    rg_training = pd.date_range(rg['start'], rg['end'], freq=rg['freq'])
elif config.test_split:
    n_training_rows = math.ceil(len(df) * (1-config.test_split))
    rg_training = df.index[:n_training_rows]

if rg_training is None:
    df_training = df
else:
    df_training = df[df.index.isin(rg_training)]

##### Build training artifact

In [21]:
# Sanity check: shape of the full dataframe.
df.shape

(40320, 1)

In [22]:
# Sanity check: shape of the training subset.
df_training.shape

(40320, 1)

In [23]:
# Resampling frequency that is passed to `TSArtifact.from_df` in the next cell.
print(config.resampling_freq)

20T


In [24]:
#| export
# Build the training artifact from the training dataframe. The artifact files
# go under <home>/<config.wandb_artifacts_path>. `verbose` is fixed to 2 here,
# independently of the notebook-level `verbose` parameter.
training_artifact_options = {
    'name': config.artifact_name,
    'missing_values_technique': config.missing_values_technique,
    'resampling_freq': config.resampling_freq,
    'normalize': config.normalize_training,
    'path': str(Path.home()/config.wandb_artifacts_path),
    'verbose': 2,
}
training_artifact = TSArtifact.from_df(df_training, **training_artifact_options)
if verbose > 0:
    display(training_artifact.metadata)

[ From df ] sd 1970-01-01 00:00:00, ed 1970-01-28 23:59:00
[ From df ] df_query~(40320, 1)
[ From df ] df_missing~(40320, 1)
[ From df ] df_resampled~(2016, 1)
About to write df to  /home/macu/data/wandb_artifacts/-110420312406903399


{'TS': {'sd': '1970-01-01 00:00:00',
  'ed': '1970-01-28 23:59:00',
  'created': 'from-df',
  'n_vars': 1,
  'handle_missing_values_technique': 'None',
  'has_missing_values': 'False',
  'n_samples': 2016,
  'freq': '<20 * Minutes>',
  'vars': ['Seasonal time series w/ noise'],
  'hash': '-110420312406903399'}}

In [25]:
#| export
# Debugging
# Safety net: the training index must not contain repeated labels.
if any(df_training.index.duplicated()):
    raise ValueError("Duplicated index names")

#### Testing data

##### Build dataframe & artifact

In [26]:
#| export
# Testing data
# Build the testing dataframe and its artifact when either an explicit testing
# range or a test split is configured; otherwise `testing_artifact` is None.
rg = config.range_testing

if rg or config.test_split:

    if isinstance(rg, list):
        # Explicit list of index labels.
        rg_testing = rg

    elif isinstance(rg, dict):
        # Range description expanded through pd.date_range.
        rg_testing = pd.date_range(rg['start'], rg['end'], freq=rg['freq'])

    elif config.test_split:
        # Trailing part of the index, complementary to the training split.
        rg_testing = df.index[math.ceil(len(df) * (1 - config.test_split)):]

    else:
        # `rg` is set but has an unsupported type and no split is configured.
        # Previously this fell through to `df.index.isin(None)`, which fails
        # with an unrelated-looking TypeError.
        raise TypeError(
            "config.range_testing must be a list or a dict with 'start', 'end' "
            "and 'freq' keys, got " + type(rg).__name__
        )

    df_testing = df[df.index.isin(rg_testing)]
    testing_artifact = TSArtifact.from_df(df_testing,
                                          name=config.artifact_name, 
                                          missing_values_technique=config.missing_values_technique,
                                          resampling_freq=config.resampling_freq, 
                                          normalize=False,
                                          path=str(Path.home()/config.wandb_artifacts_path))
    # Gated by `verbose`, like the training and train+test metadata displays.
    if verbose > 0: display(testing_artifact.metadata)
    if df_testing.index.duplicated().any():
        print("There exist duplicated value(s) in the index dataframe.")
    else:
        if verbose > 0: print("There is no duplicated value in the index dataframe.")
else:
    if verbose > 0: print("rg "+ str(rg) + " | test_split "+ str(config.test_split))
    testing_artifact = None

rg None | test_split None


#### Training + Testing data

##### Build dataframe & artifact

In [27]:
#| export
# Training + Testing data
# Joining needs a testing subset: `df_testing` only exists when a testing range
# or a test split is configured (in which case `testing_artifact` is not None).
# Without this guard, `joining_train_test=True` with no testing data raised a
# NameError on `df_testing`.
if config.joining_train_test and testing_artifact is not None:
    print("joining_train_test: "+ str(config.joining_train_test))
    df_train_test = pd.concat([df_training, df_testing])
    train_test_artifact = TSArtifact.from_df(
        df_train_test,
        name=config.artifact_name, 
        missing_values_technique=config.missing_values_technique,
        resampling_freq=config.resampling_freq, 
        normalize=False,
        path=str(Path.home()/config.wandb_artifacts_path)
    )
    if df_train_test.index.duplicated().any():
        print("There exist duplicated value(s) within the dataframe index.")
    else:
        if verbose > 0: print("There is no duplicated value in the dtaframe index")
    if verbose > 0: display(train_test_artifact.metadata)
else:
    if config.joining_train_test:
        # Requested but impossible: tell the user instead of failing later.
        print("joining_train_test is enabled but no testing data is configured; skipping the joined artifact")
    train_test_artifact = None

### Storing artifacts

For experiment tracking and hyperparameter tuning we use **Weights & Biases**.

> 
> Before running this part of the notebook, make sure the `$WANDB_API_KEY`, `$WANDB_ENTITY` and `$WANDB_PROJECT` environment variables are defined with your API_KEY, ENTITY and PROJECT names (run `echo $WANDB_API_KEY` in a terminal to check it; same for the other variables). If not, run `wandb login [API_KEY]` in a terminal to set the first one. You can find your API_KEY [here](https://wandb.ai/authorize) or in the settings of your W&B account. Run `export WANDB_ENTITY=entity_name` and/or `export WANDB_PROJECT=project_name` in a terminal to set the other two.
> 
> <span style="color:red"> TODO: Modify config.ipynb so it gets wandb config from base.yml </span>.

In [28]:
#| export
import os
# Expose this notebook's path through the WANDB_NOTEBOOK_NAME environment
# variable and reuse the notebook name as the run name.
name = "01_dataset_artifact"
path = os.path.expanduser("~/work/nbs_pipeline/")
os.environ["WANDB_NOTEBOOK_NAME"] = f"{path}{name}.ipynb"
runname = name
print("runname: "+runname)

runname: 01_dataset_artifact


In [29]:
#| export
# The W&B run mode follows the configuration: 'online' when W&B is enabled,
# 'disabled' otherwise.
if config.use_wandb:
    mode = 'online'
else:
    mode = 'disabled'

# Make the run that will produce the artifact
with wandb.init(job_type='create_dataset', resume=True, mode=mode, config=config, name=runname) as run:
    if not testing_artifact:
        # No separate testing artifact: the training artifact is logged as 'all'.
        run.log_artifact(training_artifact, aliases=['all'])

    else:
        run.log_artifact(training_artifact, aliases=['train'])
        run.log_artifact(testing_artifact, aliases=['test'])

        # The joined artifact only exists when joining was requested.
        if train_test_artifact:
            run.log_artifact(train_test_artifact, aliases=['all'])

[34m[1mwandb[0m: Currently logged in as: [33mmi-santamaria[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [30]:
#| export
# NOTE(review): the run was opened with `with wandb.init(...) as run:` in the
# previous cell, whose context manager presumably already finishes it on exit —
# confirm whether this explicit call is still needed.
run.finish()

In [31]:
#| export
from dvats.imports import beep
# Announce the end of the execution on stdout, then call the project's `beep`
# helper (presumably an audible signal — confirm in dvats.imports).
print("Execution ended")
beep(1)

Execution ended


In [32]:
#| hide
# Optional hard reset: terminate the kernel process immediately with exit
# status 0 when `reset_kernel` is set.
if reset_kernel:
    import os
    os._exit(0)

In [33]:
# Preview of the training dataframe.
# NOTE(review): never reached when `reset_kernel` is True, because the previous
# cell terminates the kernel process.
df_training.head()

Unnamed: 0,Seasonal time series w/ noise
1970-01-01 00:00:00,0.531783
1970-01-01 00:01:00,-2.068837
1970-01-01 00:02:00,2.619687
1970-01-01 00:03:00,-1.884108
1970-01-01 00:04:00,0.35462
