In [1]:
from azureml.core.runconfig import RunConfiguration
from azureml.core import Workspace, Experiment, ScriptRunConfig
import json
from azureml.core.authentication import AzureCliAuthentication
#from sklearn.externals import joblib
import joblib

In [6]:
#Service Principal Authentication

from azureml.core.authentication import ServicePrincipalAuthentication
import os

base_dir='./configuration'
config_json = os.path.join(base_dir, "config.json")
with open(config_json, "r") as f:
    config = json.load(f)

workspace_name = config["workspace_name"]
resource_group = config["resource_group"]
subscription_id = config["subscription_id"]
location = config["location"]
    
auth = ServicePrincipalAuthentication(
    tenant_id=config["tenant_id"],
    service_principal_id=config["service_principal_id"],
    service_principal_password=config["service_principal_password"],
)

ws = Workspace.get(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,        
        auth=auth
    )

print('Workspace name: ' + ws.name, 'Azure region: ' + ws.location, 'Subscription id: ' + ws.subscription_id,'Resource group: ' + ws.resource_group, sep='\n')

Workspace name: ShivaMLservice
Azure region: westus2
Subscription id: 46668180-b0ad-4a49-bed9-88f16f315dce
Resource group: MLGroup


In [2]:
#Interactive Authentication

with open("./configuration/config.json") as f:
    config = json.load(f)

workspace_name = config["workspace_name"]
resource_group = config["resource_group"]
subscription_id = config["subscription_id"]
location = config["location"]


ws = Workspace.get(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,
    )

#print('Workspace name: ' + ws.name, 'Azure region: ' + ws.location, 'Subscription id: ' + ws.subscription_id,'Resource group: ' + ws.resource_group, sep='\n')

In [3]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get('AML_COMPUTE_CLUSTER_NAME', 'standard-cluster')
compute_min_nodes = os.environ.get('AML_COMPUTE_CLUSTER_MIN_NODES', 0)
compute_max_nodes = os.environ.get('AML_COMPUTE_CLUSTER_MAX_NODES', 4)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get('AML_COMPUTE_CLUSTER_SKU', 'STANDARD_D2_V2')


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes, 
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    
    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
    
     # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. standard-cluster


In [12]:
compute_target.name

'standard-cluster'

In [4]:
datastore = ws.get_default_datastore()
datastore.upload_files(files = ['./data/robberies.csv'],
                       target_path = 'timeseries-dataset/tabular/',
                       overwrite = True,
                       show_progress = True)

Uploading an estimated of 1 files
Uploading ./data/robberies.csv
Uploaded ./data/robberies.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_407c42fb54df478cb654c4cf6204c333

In [6]:
from azureml.core import Dataset
dataset = Dataset.Tabular.from_delimited_files(path = [(datastore, 'timeseries-dataset/tabular/robberies.csv')])

# preview the first 3 rows of the dataset
dataset.take(3).to_pandas_dataframe()

Unnamed: 0,Month,Monthly Boston armed robberies Jan.1966-Oct.1975 Deutsch and Alt (1977)
0,1966-01,41
1,1966-02,39
2,1966-03,50


In [21]:
dataset = dataset.register(workspace = ws,
                           name = 'robberies',
                           description='training dataset for arima forecasting',
                           create_new_version=True)

In [8]:
script_folder = os.path.join(os.getcwd(), 'scripts', 'training')
script_folder

'/home/nbuser/library/scripts/training'

In [9]:
%%writefile $script_folder/arima_amlcompute.py

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.externals import joblib

from pandas import Grouper
#from pandas.plotting import lag_plot
#from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
#from sklearn.model_selection import TimeSeriesSplit
#from statsmodels.graphics.gofplots import qqplot
#from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA

from azureml.core import Dataset, Run

run = Run.get_context()
# get input dataset by name
#dataset = run.input_datasets['robberies']

ws = run.experiment.workspace
dataset = Dataset.get_by_name(workspace=ws, name='robberies')

df = dataset.to_pandas_dataframe()
df.index = df['Month']
df = df.drop('Month', axis=1)
df.columns = ['Robberies']
series = pd.Series(df['Robberies'])

def mean_and_variance(X):
    split = int(len(X) / 2)
    X1, X2 = X[0:split], X[split:]
    mean1, mean2 = X1.mean(), X2.mean()
    var1, var2 = X1.var(), X2.var()
    print('mean1=%f, mean2=%f' % (mean1, mean2))
    print('variance1=%f, variance2=%f' % (var1, var2))
    
mean_and_variance(series.values)

def fuller_test(X):
    result = adfuller(X)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
    	print('\t%s: %.3f' % (key, value))
        
fuller_test(series.values)

plot_acf(series)

plot_pacf(series)

X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]

model = ARIMA(train, order=(4,2,1))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
print(residuals.describe())

predictions=model_fit.forecast(steps=test.size)[0]

mse = mean_squared_error(test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(test,predictions)
print('Test RMSE: %.3f' % rmse)
print('Test R2: %.3f' % r2)

#### ROLLING FORECAST

history = [x for x in train]
predictions = list()
for t in range(len(test)):
    model = ARIMA(history, order=(4,2,1))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
error = mean_squared_error(test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(test,predictions)
print('Test RMSE: %.3f' % rmse)
print('Test R2: %.3f' % r2)

# plot
plt.plot(test)
plt.plot(predictions, color='red')
plt.show()

run.log('RMSE', rmse)
run.log('R2', r2)

model_file_name = 'arima_model.pkl'

os.makedirs('./outputs', exist_ok=True)
with open(model_file_name, 'wb') as file:
    joblib.dump(value=model_fit, filename='outputs/' + model_file_name)

Writing /home/nbuser/library/scripts/training/arima_amlcompute.py


In [24]:
#Create Docker based environment with scikit-learn installed.
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

myenv = Environment("myenv")

myenv.docker.enabled = True
myenv.python.conda_dependencies = CondaDependencies.create(pip_packages=['azureml-sdk', 'azureml-dataprep[pandas,fuse]','scikit-learn', 'matplotlib','statsmodels','seaborn'])

In [25]:
# Get an experiment object from Azure Machine Learning
experiment = Experiment(workspace=ws, name="arima_amlcompute")

In [26]:
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

src = ScriptRunConfig(source_directory=script_folder, script='arima_amlcompute.py')

# Set compute target to the one created in previous step
src.run_config.target = compute_target.name

# Set environment
src.run_config.environment = myenv
 
run = experiment.submit(config=src)
run

Experiment,Id,Type,Status,Details Page,Docs Page
arima_amlcompute,arima_amlcompute_1587018381_f99aed14,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [28]:
from azureml.widgets import RunDetails

# monitor the run
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [29]:
run.get_metrics()

{'RMSE': 63.698300467459504, 'R2': 0.33988792864021244}