In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import TimeGrouper
from pandas.tools.plotting import lag_plot
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA

In [2]:
import azureml.core
from azureml.core import Experiment, Workspace

# Report the SDK version this notebook was written against next to the installed one.
print("This notebook was created using version 1.0.2 of the Azure ML SDK")
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
print("")

# Connect to the workspace via Workspace.from_config() and echo its identifying details,
# one per line.
ws = Workspace.from_config()
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

This notebook was created using version 1.0.2 of the Azure ML SDK
You are currently using version 1.0.72 of the Azure ML SDK

Workspace name: ShivaMLservice
Azure region: westus2
Subscription id: 46668180-b0ad-4a49-bed9-88f16f315dce
Resource group: MLGroup


In [3]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get('AML_COMPUTE_CLUSTER_NAME', 'cpu-cluster')
# Environment variables are always strings, so coerce the node counts to int;
# otherwise an overridden value would reach the provisioning call as str.
compute_min_nodes = int(os.environ.get('AML_COMPUTE_CLUSTER_MIN_NODES', 0))
compute_max_nodes = int(os.environ.get('AML_COMPUTE_CLUSTER_MAX_NODES', 4))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get('AML_COMPUTE_CLUSTER_SKU', 'STANDARD_D2_V2')


if compute_name in ws.compute_targets:
    # Reuse the compute target that is already attached to the workspace.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
    else:
        # Previously this case passed silently; surface it so a name clash with a
        # different kind of compute is noticed before a run is submitted.
        print('WARNING: compute target ' + compute_name +
              ' exists but is not an AmlCompute cluster: ' + str(type(compute_target)))
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. cpu-cluster


In [7]:
# Upload the local robberies CSV to the workspace's default datastore,
# replacing any copy that is already there.
datastore = ws.get_default_datastore()
local_csv_files = ['./data/robberies.csv']
remote_folder = 'timeseries-dataset/tabular/'
datastore.upload_files(files=local_csv_files,
                       target_path=remote_folder,
                       overwrite=True,
                       show_progress=True)

Uploading an estimated of 1 files
Uploading ./data/robberies.csv
Uploaded ./data/robberies.csv, 1 files out of an estimated total of 1
Uploaded 1 files


$AZUREML_DATAREFERENCE_e87a8d27834e4d01ad7fdbd06677e1e4

In [8]:
from azureml.core import Dataset
# Build a tabular dataset on top of the CSV that was uploaded to the datastore.
robberies_csv_ref = (datastore, 'timeseries-dataset/tabular/robberies.csv')
dataset = Dataset.Tabular.from_delimited_files(path=[robberies_csv_ref])

# Show the first 3 rows of the dataset as a pandas DataFrame.
preview_rows = dataset.take(3)
preview_rows.to_pandas_dataframe()

Unnamed: 0,Month,Monthly Boston armed robberies Jan.1966-Oct.1975 Deutsch and Alt (1977)
0,1966-01,41
1,1966-02,39
2,1966-03,50


In [37]:
from azureml.core import Dataset

# Build a file dataset over the whole datastore folder (note: this rebinds `dataset`).
tabular_folder_ref = (datastore, 'timeseries-dataset/tabular/')
dataset = Dataset.File.from_files(path=[tabular_folder_ref])

# List the files the dataset refers to.
dataset.to_path()

array(['/robberies.csv', '/shampoo-sales.csv'], dtype=object)

In [21]:
# Register the current `dataset` in the workspace under the name 'robberies';
# a new version is created on every registration.
registration_options = dict(workspace=ws,
                            name='robberies',
                            description='training dataset for arima forecasting',
                            create_new_version=True)
dataset = dataset.register(**registration_options)

In [10]:
# Folder that holds the training script submitted to the remote compute.
script_folder = os.path.join(os.getcwd(), 'scripts')
# %%writefile does not create missing parent directories, so make sure the folder
# exists before the cell that writes arima.py into it is run.
os.makedirs(script_folder, exist_ok=True)
script_folder

'/home/nbuser/library/scripts'

In [44]:
%%writefile $script_folder/arima.py

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.externals import joblib


#from pandas.tools.plotting import lag_plot
from pandas.tools.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
#from sklearn.model_selection import TimeSeriesSplit
#from statsmodels.graphics.gofplots import qqplot
#from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA

run = Run.get_context()
# get input dataset by name
dataset = run.input_datasets['robberies']

df = dataset.to_pandas_dataframe()
df.index = df['Month']
series = pd.Series(df)

def mean_and_variance(X):
    split = int(len(X) / 2)
    X1, X2 = X[0:split], X[split:]
    mean1, mean2 = X1.mean(), X2.mean()
    var1, var2 = X1.var(), X2.var()
    print('mean1=%f, mean2=%f' % (mean1, mean2))
    print('variance1=%f, variance2=%f' % (var1, var2))
    
mean_and_variance(series.values)

def fuller_test(X):
    result = adfuller(X)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
    	print('\t%s: %.3f' % (key, value))
        
fuller_test(series.values)

plot_acf(series)

plot_pacf(series)

X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]

model = ARIMA(train, order=(4,2,1))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.show()
residuals.plot(kind='kde')
plt.show()
print(residuals.describe())

predictions=model_fit.forecast(steps=13)[0]

mse = mean_squared_error(test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(test,predictions)
print('Test RMSE: %.3f' % rmse)
print('Test R2: %.3f' % r2)

run.log('RMSE', rmse)
run.log('R2', r2)

model_file_name = 'arima_amlcompute.pkl'

os.makedirs('./outputs', exist_ok=True)
with open(model_file_name, 'wb') as file:
    joblib.dump(value=model_fit, filename='outputs/' + model_file_name)

Overwriting /home/nbuser/library/scripts/arima.py


In [45]:
from azureml.train.sklearn import SKLearn

# Estimator that runs scripts/arima.py on the remote compute target.
est = SKLearn(source_directory=script_folder,
              entry_script='arima.py',
              # pass dataset object as an input with name 'robberies'
              inputs=[dataset.as_named_input('robberies')],
              # Fixed: the requirement used to read '[azureml-dataprep[fuse]'; the stray
              # leading bracket is not a valid pip requirement. The extras now match
              # the ones used for the conda environment defined later in this notebook.
              pip_packages=['azureml-sdk',
                            'azureml-dataprep[pandas,fuse]',
                            'matplotlib',
                            'statsmodels'],
              compute_target=compute_target)

In [46]:
# Get an experiment object from Azure Machine Learning
experiment = Experiment(workspace=ws, name="arima_amlcompute")

# Submit the estimator and keep the returned handle in `run`, so that the cell that
# monitors `run` tracks this job. The former `run = experiment.start_logging()`
# replaced the handle with a separate interactive run that was never completed.
run = experiment.submit(est)

In [47]:
from azureml.widgets import RunDetails

# Show the monitoring widget for the current `run`.
run_monitor = RunDetails(run)
run_monitor.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [48]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# pip packages installed into the environment used by the script run.
training_pip_packages = [
    'azureml-sdk',
    'azureml-dataprep[pandas,fuse]',
    'scikit-learn',
    'matplotlib',
    'statsmodels',
]

conda_env = Environment('conda-env')
conda_env.python.conda_dependencies = CondaDependencies.create(pip_packages=training_pip_packages)

In [None]:
# TODO: test pip dependencies.
# The unfinished statement `conda_env.python.pip_dependencies = ` that used to be in
# this cell was a SyntaxError and stopped "Run All". The pip packages of `conda_env`
# are supplied through CondaDependencies.create(pip_packages=...) instead.

In [49]:
from azureml.core import ScriptRunConfig

# Mount the dataset on the remote compute; the mounted input is handed to the
# training script as its argument.
mounted_robberies = dataset.as_named_input('robberies').as_mount()

src = ScriptRunConfig(source_directory=script_folder,
                      script='arima.py',
                      arguments=[mounted_robberies])

# Run the script as plain Python, inside `conda_env`, on the compute cluster.
run_settings = src.run_config
run_settings.framework = 'python'
run_settings.environment = conda_env
run_settings.target = compute_target.name


In [50]:
# Submit the ScriptRunConfig to the experiment; `run` now refers to this new run.
run = experiment.submit(config=src)


In [51]:
# Show the monitoring widget for the run submitted from the ScriptRunConfig.
script_run_monitor = RunDetails(run)
script_run_monitor.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…