In [1]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace

In [2]:
!pip install FRB

Collecting FRB
  Downloading https://files.pythonhosted.org/packages/48/ce/db08a6ee92b6b0eeb71f9ddbab45177488a579ce8d5600a7b7884ed97d36/FRB-1.1.4-py2.py3-none-any.whl
Installing collected packages: FRB
Successfully installed FRB-1.1.4
You should consider upgrading via the 'pip install --upgrade pip' command.


In [4]:
# Connect to the Azure ML workspace using the saved workspace configuration.
ws = Workspace.from_config()
# Default datastore of the workspace; later cells upload the exported CSVs to
# it and use it for the pipeline's input references and step outputs.
default_store = ws.get_default_datastore()

In [5]:
import os

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

# Reuse the workspace's default CPU compute target when one is configured.
aml_compute = ws.get_default_compute_target("CPU")

if aml_compute is None:
    # No default target: provision a cluster of up to 4 nodes instead.
    amlcompute_cluster_name = "cpu-cluster2"

    # Node admin credentials must never be committed with the notebook.
    # They are read from the environment; when the variables are unset both
    # values are None and no admin account is requested for the nodes.
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D3_V2",
        admin_username=os.environ.get("AML_ADMIN_USERNAME"),
        admin_user_password=os.environ.get("AML_ADMIN_PASSWORD"),
        max_nodes=4)

    aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, provisioning_config)
    # Block (up to 20 minutes) until provisioning has finished.
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# Bare expression so the notebook displays the resolved compute target.
aml_compute

Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


AmlCompute(workspace=Workspace.create(name='AYTML', subscription_id='ea632140-cfc8-4028-bf65-ae3111412de0', resource_group='AYTML-rg'), name=cpu-cluster2, id=/subscriptions/ea632140-cfc8-4028-bf65-ae3111412de0/resourceGroups/AYTML-rg/providers/Microsoft.MachineLearningServices/workspaces/AYTML/computes/cpu-cluster2, type=AmlCompute, provisioning_state=Succeeded, location=eastus2, tags=None)

In [6]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Run configuration passed as `runconfig` to both aggregation pipeline steps.
aml_run_config = RunConfiguration()

# Execute on the compute target resolved (or provisioned) in the previous cell.
aml_run_config.target = aml_compute

# Run the step scripts inside a Docker container.
aml_run_config.environment.docker.enabled = True

# Pin the Docker base image (Azure ML base image, tag 0.2.1).
aml_run_config.environment.docker.base_image = "mcr.microsoft.com/azureml/base:0.2.1"

# False: the Python environment is not user-managed; it is built from the
# conda dependencies declared below.
aml_run_config.environment.python.user_managed_dependencies = False

# Auto-prepare the Docker image when used for execution (if it is not already prepared)
aml_run_config.auto_prepare_environment = True

# Packages installed into the step environment: pandas and scikit-learn via
# conda; the Azure ML SDK pieces and FRB via pip. SDK versions are not pinned.
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn'], 
    pip_packages=['azureml-sdk', 'azureml-dataprep', 'azureml-train-automl', 'FRB'], 
    pin_sdk_version=False)

print ("Run configuration created.")



Run configuration created.


In [7]:
## Get Data Locally
import getpass
import os
import sys

import pandas as pd

# NOTE(review): absolute, machine-specific path to the folder that provides
# the project's Ingestion module; make it configurable if the notebook moves.
sys.path.append('/home/nbuser/library/')
from Ingestion import create_sql_engine

dataDir = "data"
# Tables to export; 'ECONOMY', 'CREDIT' and 'AGRICULTURE' are currently disabled.
DATABASE_TABLES = ['RATES', 'INFLATION']

# Database credentials are taken from the environment instead of being
# hard-coded in the notebook; the password is prompted for when it is not set.
db_uid = os.environ.get('FRED_DB_UID', 'database_login')
db_password = os.environ.get('FRED_DB_PASSWORD') or getpass.getpass('Database password: ')
engine = create_sql_engine(uid=db_uid, password=db_password)

for t in DATABASE_TABLES:
    # makedirs creates the missing parents (data/ and data/curr_data/) too;
    # a plain os.mkdir fails on a fresh checkout where curr_data/ is absent.
    curr_data = os.path.join(dataDir, 'curr_data', t)
    os.makedirs(curr_data, exist_ok=True)

    # Table names come from the fixed list above, never from user input.
    query = 'SELECT * FROM [dbo].[{table}_Cleaned]'.format(table=t)
    df = pd.read_sql(query, engine, index_col='index')
    # TODO: add a group-by to aggregate before exporting.
    # Written to <dataDir>/curr_data/<table>/Cleaned.csv, the path the upload
    # cells read from.
    df.to_csv(os.path.join(curr_data, 'Cleaned.csv'))

SQL Alchemy Engine created


In [8]:
# Upload the locally exported RATES CSV to the default datastore under RATES/,
# overwriting any earlier copy. The rates pipeline step references it there
# as RATES/Cleaned.csv.
default_store.upload_files(['./data/curr_data/RATES/Cleaned.csv'], 
                           target_path = 'RATES', 
                           overwrite = True, 
                           show_progress = True)

Uploading an estimated of 1 files
Uploading ./data/curr_data/RATES/Cleaned.csv
Uploaded ./data/curr_data/RATES/Cleaned.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_51111766944d4f7c9e809193c3feda42

In [18]:
from azureml.data.data_reference import DataReference 
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep
import os
# Folder containing the step script; passed to the step as its source directory.
prepare_data_folder = './scripts/prepdata'

# Step input: the RATES CSV previously uploaded to the default datastore.
blob_rates_data = DataReference(
    datastore=default_store,
    data_reference_name="agg_rates_data",
    path_on_datastore="RATES/Cleaned.csv")

# Step output, stored on the default datastore.
# NOTE(review): the input reference, this output and the step itself all use
# the name "agg_rates_data"; the later fetch of the result relies on the step
# name matching this output name.
agg_rates_data = PipelineData("agg_rates_data", datastore=default_store)

print('Aggregate script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# Aggregation step: runs agg.py from prepare_data_folder on the shared compute
# target and run configuration.
# NOTE(review): the argument flags are still named --input_cleanse /
# --output_cleanse; presumably agg.py parses exactly these — confirm in the script.
aggRatesStep = PythonScriptStep(
    name="agg_rates_data",
    script_name="agg.py", 
    arguments=["--input_cleanse", blob_rates_data, 
               "--output_cleanse", agg_rates_data],
    inputs=[blob_rates_data],
    outputs=[agg_rates_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("aggRatesStep created.")

Aggregate script is in /home/nbuser/library/scripts/prepdata.
aggRatesStep created.


In [19]:
# Upload the locally exported INFLATION CSV to the default datastore under
# INFLATION/, overwriting any earlier copy. The inflation pipeline step
# references it there as INFLATION/Cleaned.csv.
default_store.upload_files(['./data/curr_data/INFLATION/Cleaned.csv'], 
                           target_path = 'INFLATION', 
                           overwrite = True, 
                           show_progress = True)

Uploading an estimated of 1 files
Uploading ./data/curr_data/INFLATION/Cleaned.csv
Uploaded ./data/curr_data/INFLATION/Cleaned.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_bbd11ea67a0947ef8532766f2e32b4a9

In [20]:
from azureml.data.data_reference import DataReference 
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Folder containing the step script; passed to the step as its source directory.
prepare_data_folder = './scripts/prepdata'

# Step input: the INFLATION CSV previously uploaded to the default datastore.
blob_inflation_data = DataReference(
    datastore=default_store,
    data_reference_name="inflation_data",
    path_on_datastore="INFLATION/Cleaned.csv")

# Step output, stored on the default datastore under the name "inflation_data".
agg_inflation_data = PipelineData("inflation_data", datastore=default_store)

print('Aggregate script is in {}.'.format(os.path.realpath(prepare_data_folder)))

# Aggregation step: runs the same agg.py script as the rates step, on the
# inflation input.
# NOTE(review): unlike the rates step, the step name ("Aggregate Inflation
# Data") differs from the output name ("inflation_data"), so the output must
# be fetched by its own name rather than by the step name.
aggInflationStep = PythonScriptStep(
    name="Aggregate Inflation Data",
    script_name="agg.py", 
    arguments=["--input_cleanse", blob_inflation_data, 
               "--output_cleanse", agg_inflation_data],
    inputs=[blob_inflation_data],
    outputs=[agg_inflation_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    source_directory=prepare_data_folder,
    allow_reuse=True
)

print("aggInflationStep created.")

Aggregate script is in /home/nbuser/library/scripts/prepdata.
aggInflationStep created.


In [21]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# The two aggregation steps do not consume each other's outputs; both are
# registered in a single pipeline on the workspace.
pipeline_steps=[aggRatesStep,aggInflationStep]
pipeline = Pipeline(workspace = ws, steps=pipeline_steps)
print("Pipeline is built.")

Pipeline is built.


In [22]:
from azureml.core import Experiment

# Experiment in the workspace under which the aggregation pipeline run is submitted.
experiment = Experiment(ws,'FRED_Experiment')

print("Experiment created")

Experiment created


In [23]:
# Submit the pipeline. regenerate_outputs=False does not force a re-run, so
# steps created with allow_reuse=True may reuse a previous run's output.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)

print("Pipeline submitted for execution.")

Created step agg_rates_data [473d35d7][cb846918-b794-4184-a4f4-805993f852be], (This step will run and generate new outputs)
Created step Aggregate Inflation Data [2a4675ee][1f1ee6de-87ca-4dbb-816c-6d4876867fc0], (This step is eligible to reuse a previous run's output)
Using data reference INPUT_agg_rates_data for StepId [a96cd663][a4c1cdee-5427-4f25-9d37-98d223ecc85a], (Consumers of this data are eligible to reuse prior runs.)
Using data reference INPUT_inflation_data for StepId [8b53551a][fa0a5b59-bdbd-44ac-a257-181867a43276], (Consumers of this data are eligible to reuse prior runs.)
Submitted pipeline run: e64860df-f4e3-40f4-803b-30a4c941fceb
Pipeline submitted for execution.


In [24]:
# Show the interactive run-details widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

### Read Aggregated Rates Data

In [25]:
# Block until the pipeline run has finished; the cells below download step
# outputs, which only exist once the run is complete.
pipeline_run.wait_for_completion()

import azureml.dataprep as dprep
# functions to download output to local and fetch as dataframe
def get_download_path(download_path, output_name):
    """Return the local path of a downloaded pipeline output.

    Looks inside ``<download_path>/azureml`` for the sub-folder created by the
    download and returns ``<download_path>/azureml/<sub-folder>/<output_name>``.

    Args:
        download_path: Directory the output was downloaded into.
        output_name: Name of the output; used as the last path component.

    Returns:
        The '/'-joined path as a string.

    Raises:
        FileNotFoundError: If ``<download_path>/azureml`` does not exist or
            contains no sub-folder.
    """
    azureml_root = download_path + '/azureml'
    # os.listdir returns entries in arbitrary order and may include stray
    # files; keep directories only and sort them so that the folder picked is
    # deterministic when several are present.
    candidates = sorted(
        entry for entry in os.listdir(azureml_root)
        if os.path.isdir(os.path.join(azureml_root, entry))
    )
    if not candidates:
        raise FileNotFoundError(
            'No downloaded output folder found under ' + azureml_root)
    return azureml_root + '/' + candidates[0] + '/' + output_name

def fetch_df(step, output_name):
    """Download one output of a step run and load it with dataprep.

    The output is downloaded into ``./outputs/<output_name>`` and the file
    ``part-00000`` inside the downloaded output folder is read.

    Args:
        step: Step run object exposing ``get_output_data``.
        output_name: Name of the step output to fetch.

    Returns:
        Whatever ``dprep.auto_read_file`` returns for the downloaded file.
    """
    step_output = step.get_output_data(output_name)

    local_dir = f'./outputs/{output_name}'
    step_output.download(local_dir)

    # get_download_path resolves the folder created under <local_dir>/azureml.
    part_file = '/'.join([get_download_path(local_dir, output_name), 'part-00000'])
    return dprep.auto_read_file(path=part_file)

PipelineRunId: e64860df-f4e3-40f4-803b-30a4c941fceb
Link to Portal: https://mlworkspace.azure.ai/portal/subscriptions/ea632140-cfc8-4028-bf65-ae3111412de0/resourceGroups/AYTML-rg/providers/Microsoft.MachineLearningServices/workspaces/AYTML/experiments/FRED_Experiment/runs/e64860df-f4e3-40f4-803b-30a4c941fceb

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'e64860df-f4e3-40f4-803b-30a4c941fceb', 'status': 'Completed', 'startTimeUtc': '2019-11-08T17:35:47.870741Z', 'endTimeUtc': '2019-11-08T17:42:03.55306Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': None, 'runType': 'HTTP', 'azureml.parameters': '{}'}, 'logFiles': {'logs/azureml/executionlogs.txt': 'https://aytml9430632465.blob.core.windows.net/azureml/ExperimentRun/dcid.e64860df-f4e3-40f4-803b-30a4c941fceb/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=JRk9D7Xno7fgSuALoUy6YvNNXUaWlRzCdxU2FFaLgWo%3D&st=2019-11-08T17%3A32%3A16Z&se=2019-11-09T01%3A42%3A16Z&sp=r', 'logs/azureml/stde

In [46]:
# Look up the run of the rates aggregation step by step name (first match).
agg_Rates_Step = pipeline_run.find_step_run(aggRatesStep.name)[0]
print(aggRatesStep.name)
# The step name is passed as the output name; this works because the step and
# its PipelineData output are both named "agg_rates_data".
# NOTE(review): the same pattern would not work for the inflation step, whose
# step name differs from its output name.
agg_Rates_df = fetch_df(agg_Rates_Step, agg_Rates_Step.name)
display(agg_Rates_df.head(5))

agg_rates_data




Unnamed: 0,period,value
0,1962-01-01,4.08
1,1962-04-01,3.95
2,1962-07-01,4.06
3,1962-10-01,4.07
4,1963-01-01,4.0


#### Create Train and Test DataFrames

In [70]:
# NOTE(review): test_data is only assigned in the next cell (In [75]); on a
# fresh kernel run top-to-bottom this cell raises NameError. Move it below the
# split or remove it.
test_data

pandas.core.frame.DataFrame

In [75]:
# Materialise the aggregated rates as a pandas frame, then split it in row
# order: the first 80 % of rows for training, the remaining rows for testing.
p_df = agg_Rates_df.to_pandas_dataframe()
n_train = int(len(p_df) * 0.8)
train_data, test_data = p_df[:n_train], p_df[n_train:]

# Name of the target column, plus the training periods and target values as arrays.
label = 'value'
X, y = train_data['period'].values, train_data['value'].values

### AutoML Step Run

In [31]:
from azureml.train.automl import AutoMLConfig

In [61]:
import logging
# Forecasting-specific settings, unpacked into AutoMLConfig below: 'period' is
# the time column and the maximum forecast horizon is set to 50.
time_series_settings = {
    "time_column_name": "period",
    "max_horizon": 50
}

# AutoML forecasting run: optimises normalized RMSE, 5-fold cross-validation,
# 15-minute experiment timeout with early stopping, ensembling disabled.
# NOTE(review): X is the full train_data frame, which still contains the
# 'value' column that is also passed as y. This looks like target leakage and
# would explain a near-zero test RMSE. Confirm whether `label='value'` makes
# AutoML drop that column; if not, pass X without it (and predict without it).
automl_config = AutoMLConfig(task='forecasting',
                             primary_metric='normalized_root_mean_squared_error',
                             experiment_timeout_minutes=15,
                             enable_early_stopping=True,
                             X=train_data,
                             y=y,
                             label='value',
                             n_cross_validations=5,
                             enable_ensembling=False,
                             verbosity=logging.INFO,
                             **time_series_settings)

In [62]:
from azureml.core import Experiment

# Separate experiment for the AutoML run, distinct from the pipeline's
# 'FRED_Experiment'.
experiment_aml = Experiment(ws,'FRED_Experiment_AutoML')

print("Experiment created")

Experiment created


In [63]:
# Submit the AutoML configuration; show_output=True streams progress into the
# cell output. Despite its name, pipeline_run_aml holds the AutoML run, not a
# pipeline run.
pipeline_run_aml = experiment_aml.submit(automl_config, show_output=True)

# NOTE(review): the message below says "Pipeline" although an AutoML run was submitted.
print("Pipeline submitted for execution.")

Running on local machine
Parent Run ID: AutoML_a4c48a8c-da19-4fa1-8b07-720098456381
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating CV splits.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize 

In [64]:
# Retrieve the best child run of the AutoML run together with its fitted model.
best_run, fitted_model = pipeline_run_aml.get_output()

In [79]:
# Ground-truth target values of the held-out rows, used for scoring below.
actual_labels = test_data['value']

In [77]:
# Predict on the held-out rows.
# NOTE(review): test_data still carries the 'value' column that is being
# predicted — confirm the fitted model does not consume it as a feature.
predict_labels = fitted_model.predict(test_data)

In [81]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the predictions against the held-out 'value' column.
rmse = sqrt(mean_squared_error(actual_labels, predict_labels))
# Bare expression so the notebook displays the score.
rmse

0.006182752934849445

In [83]:
# Display the summary of the best AutoML child run.
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
FRED_Experiment_AutoML,AutoML_a4c48a8c-da19-4fa1-8b07-720098456381_14,,Completed,Link to Azure Portal,Link to Documentation
