In [3]:
# Add the current working directory to the import path so the local
# `helpers` package used by this notebook can be imported.
import sys
sys.path.append('.')

In [2]:
# pip install azure-storage-blob --upgrade

In [4]:
from helpers.dataprep_utils import compute_missing_ratio
from helpers.sas_utils import get_data_link, get_placeholder_link
from helpers.service_utils import RestEDDI

In [5]:
import pandas as pd
import os
import toml

In [6]:
# Read the wide-format sensor readings; the first CSV column becomes the row index.
df_sensors = pd.read_csv(
    './data_generated/sensor_wide.csv',
    index_col=0,
)
# Show the first three rows as a quick sanity check.
df_sensors.head(3)

Unnamed: 0_level_0,IN1,IN2,IN3,IN4,IN5,IN6,IN7,IN8,Out1,Out2
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.077744,0.795565,-0.665503,0.879321,0.134419,-1.133765,0.253945,0.109987,-0.122686,0.123661
1,0.080313,0.824595,-0.655447,0.875636,0.134941,-1.212052,1.661342,-0.090342,-0.122686,0.123661
2,0.087355,0.776258,-0.65055,0.884105,0.132452,-1.294233,0.399097,-0.255465,-0.026857,0.123661


In [7]:
# Per-column missing-value ratio of the raw sensor data (before imputation).
compute_missing_ratio(df_sensors)

Unnamed: 0,Missing Ratio
IN4,0.763836
IN8,0.743004
IN3,0.743004
IN2,0.743004
IN1,0.743004
Out1,0.687452
IN5,0.687452
IN6,0.680508
Out2,0.576349
IN7,0.576349


## EDDI Initial Setup

Load the **container_sas_link** of the blob container to which the train/test data will be uploaded; a directory named after _blob_storage_root_ is created inside that container. Also load the **subscription_key** that we received after subscribing to the EDDI API.

In [8]:
# Settings are read from a TOML file so that no secret is written in the notebook.
config = toml.load("config/config.toml")

# Blob-container SAS link and EDDI subscription key, each from its own config section.
container_sas_link, subscription_key = (
    config['blob']['container_sas_link'],
    config['eddi']['subscription_key'],
)

Use the helper functions to upload data to blob storage and create SAS links; use RestEDDI to issue the REST requests for training and batch inference:

In [9]:
# REST client for the EDDI service, constructed with the subscription key
# and pinned to API version v2.3.
rest_eddi = RestEDDI(
    subscription_key,
    api_version="v2.3",
)

Select the names of the local directory and of the blob-storage directory that will host the data

In [10]:
local_storage_root = './data_temp/'
blob_storage_root = 'sensor_datasets'

# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs(local_storage_root, exist_ok=True)

# Build the local path once; the file is written without the index and
# without a header row, i.e. values only.
local_sensor_csv = local_storage_root + 'sensor_wide.csv'
df_sensors.to_csv(local_sensor_csv, index=False, header=False)

# blob storage: hand the local file to the helper and keep the link it returns
# as the training data source (stored under `blob_storage_root` in the container).
training_data_source = get_data_link(local_sensor_csv, container_sas_link, directory_name=blob_storage_root)

EDDI-MVP Train Parameter Setup

In [22]:
variables_metadata = []
epsilon = 0.01

# Build one metadata dict per column. The declared bounds are the observed
# min/max of the column, widened by a relative margin of `epsilon`.
for col in df_sensors.columns:
    col_min = df_sensors[col].min()
    col_max = df_sensors[col].max()
    col_info = {
            "query": True,
            "type": "continuous",
            "name": col,
            # abs() keeps the margin pointing outwards for negative values:
            # `min - epsilon * min` would move a negative minimum towards zero
            # (and `max + epsilon * max` a negative maximum away from zero),
            # leaving observed values outside the declared range.
            "lower": col_min - epsilon * abs(col_min),
            "upper": col_max + epsilon * abs(col_max),
    }
    variables_metadata.append(col_info)

In [23]:
# Request body for the train call: data location, model hyper-parameters,
# per-column metadata and training hyper-parameters (key order unchanged).
train_input = dict(
    training_data_source=training_data_source,
    model_hyperparams=dict(decoder_variances=1e-6),
    variables_metadata=variables_metadata,
    training_hyperparams=dict(epochs=1000, iterations=400),
)

EDDI-MVP train request

To keep the notebook cleaner, we wrap the following three lines in a separate function and use that function in the next notebooks:

In [24]:
# Submit the training request and keep the returned operation id for the status check.
operation_id = rest_eddi.train_model(train_input)

<Response [200]>
200
'a7de641d66024984b8c384a500574c0a'


In [25]:
# Check the status of the train operation until it is done.
status = rest_eddi.wait_for_operation_to_complete(operation_id, operation_name="train")

Operation status: Completed
train running time: 1972.4830317497253 seconds


In [26]:
# Keep the train operation id under its own name: it is passed as the model id
# to the batch-inference call, where `operation_id` gets reassigned.
model_id = operation_id

Impute the same dataset

In [29]:
# Prediction runs on the same dataset that was used for training.
testing_data_source = training_data_source

# Link that is later passed as the inference output and read back as a CSV.
# NOTE(review): delete_prev=True presumably removes an earlier output with the
# same name — confirm in helpers.sas_utils.
imputed_data_source = get_placeholder_link(
    local_storage_root + 'sensor_wide_impute.csv',
    container_sas_link,
    directory_name=blob_storage_root,
    delete_prev=True,
)

In [30]:
# Request body for batch inference: sampling hyper-parameter, input data link
# and the link the result is written to (key order unchanged).
batch_inference_input = dict(
    hyperparameters=dict(sample_count=50),
    data_source=testing_data_source,
    output=imputed_data_source,
)

In [31]:
# Start batch inference with the trained model. `operation_id` is reassigned
# here; the train operation id remains available as `model_id`.
operation_id = rest_eddi.batch_inference(model_id, batch_inference_input)

<Response [200]>
200
'a9e1f8f1dcc24d06908c4e3cd4a0fcf5'


In [41]:
# Check the status of the batch-inference operation until it is done.
status = rest_eddi.wait_for_operation_to_complete(operation_id, operation_name="batch_inference")

Download and store data locally

In [48]:
# Read the imputed CSV from its link. The file is treated as header-less and
# the column names are taken from the original frame.
# NOTE(review): the original row index of df_sensors is not restored here;
# the imputed frame gets a default integer index.
df_sensors_imputed = pd.read_csv(
    imputed_data_source,
    header=None,
    names=df_sensors.columns,
)

In [49]:
# Show the first three imputed rows for comparison with the raw data shown earlier.
df_sensors_imputed.head(3)

Unnamed: 0,IN1,IN2,IN3,IN4,IN5,IN6,IN7,IN8,Out1,Out2
0,0.077744,0.795565,-0.665503,0.879322,0.134419,-1.133765,0.253945,0.109987,-0.122686,0.123661
1,0.080313,0.824595,-0.655447,0.875636,0.134941,-1.212052,1.661342,-0.090342,-0.122686,0.123661
2,0.087355,0.776258,-0.65055,0.884105,0.132452,-1.294233,0.399097,-0.255465,-0.026857,0.123661


In [58]:
# Re-run the missing-value check on the imputed data.
compute_missing_ratio(df_sensors_imputed)

Unnamed: 0,Missing Ratio


In [54]:
# Make sure the output directory exists first: to_csv does not create
# missing directories and would fail on a fresh checkout.
os.makedirs('./data_prepared', exist_ok=True)
df_sensors_imputed.to_csv('./data_prepared/prep_sensor_data.csv')