# Retrieve the MNIST data

In [None]:
import os
import urllib.request

# Show the working directory so it is clear where ./data/mnist ends up.
print(os.getcwd())

# Make sure the destination folder for the dataset exists.
os.makedirs('./data/mnist', exist_ok=True)

# The train and test sets (images and labels) are separate archives, so each
# one is downloaded on its own, in the order listed here.
mnist_downloads = (
    ('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', './data/mnist/train-images.gz'),
    ('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', './data/mnist/train-labels.gz'),
    ('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', './data/mnist/test-images.gz'),
    ('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', './data/mnist/test-labels.gz'),
)
for source_url, local_path in mnist_downloads:
    urllib.request.urlretrieve(source_url, filename=local_path)

# Split out the datasets

In [None]:
import gzip
import numpy as np
import struct

# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
    """Load a gzip-compressed MNIST IDX file into a numpy array.

    Args:
        filename: Path to the ``.gz`` archive.
        label: ``True`` to parse a label file, ``False`` (the default) to
            parse an image file.

    Returns:
        A ``uint8`` array of shape ``(n_items, n_rows * n_cols)`` for an
        image file, or ``(n_items, 1)`` for a label file.

    Raises:
        ValueError: If the magic number does not match the requested kind of
            file, or the archive holds fewer bytes than its header announces.
    """
    # MNIST IDX magic numbers: 2051 for image files, 2049 for label files.
    expected_magic = 2049 if label else 2051
    with gzip.open(filename) as gz:
        # All header fields are big-endian unsigned 32-bit integers.
        magic, n_items = struct.unpack('>II', gz.read(8))
        if magic != expected_magic:
            raise ValueError(
                'unexpected magic number %d in %s (expected %d)'
                % (magic, filename, expected_magic))
        if label:
            item_size = 1
        else:
            n_rows, n_cols = struct.unpack('>II', gz.read(8))
            item_size = n_rows * n_cols
        payload = gz.read(n_items * item_size)

    # Fail with a clear message instead of an opaque reshape error.
    if len(payload) != n_items * item_size:
        raise ValueError(
            '%s is truncated: expected %d bytes of data, found %d'
            % (filename, n_items * item_size, len(payload)))
    return np.frombuffer(payload, dtype=np.uint8).reshape(n_items, item_size)

print('Functions defined')

In [None]:
# Pixel intensities are divided by 255.0 so they fall in 0-1 instead of 0-255,
# which helps the model converge faster. Labels are loaded as a single column
# and flattened to one dimension.

X_train = load_data('./data/mnist/train-images.gz') / 255.0
X_test = load_data('./data/mnist/test-images.gz') / 255.0

y_train = load_data('./data/mnist/train-labels.gz', label=True).reshape(-1)
y_test = load_data('./data/mnist/test-labels.gz', label=True).reshape(-1)

print('Data loaded')

In [None]:
# Show the shapes of the four arrays as the cell output (sanity check).
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Display sample images

In [None]:
%matplotlib inline 

import matplotlib.pyplot as plt
import numpy as np


sample_size = 30
plt.figure(figsize=(16, 6))

# Draw sample_size randomly chosen training images in a single row, each one
# annotated with its label. `count` is the 1-based subplot position.
count = 0
random_indices = np.random.permutation(X_train.shape[0])[:sample_size]
for count, idx in enumerate(random_indices, start=1):
    plt.subplot(1, sample_size, count)
    plt.axhline('')
    plt.axvline('')
    plt.text(x=10, y=-10, s=y_train[idx], fontsize=18)
    # Each image is stored flat; restore the 28x28 grid before drawing it.
    plt.imshow(X_train[idx].reshape(28, 28), cmap=plt.cm.Greys)

plt.show()

print('Done')

# Train

## Local machine

In [None]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression classifier with default settings and fit it
# on the training set (fit returns the fitted estimator itself).
clf = LogisticRegression().fit(X_train, y_train)

# Predict the test set and print the fraction of correct predictions.
y_hat = clf.predict(X_test)
accuracy = np.average(y_hat == y_test)
print(accuracy)

print('Done')

## Use the Experimentation Service

### Load Azure Subscription details

In [None]:
import configparser

# Read the Azure subscription details from the [AZURE] section of config.ini.
config = configparser.ConfigParser()
config.read('config.ini')

azure_settings = config['AZURE']
subscription_id = azure_settings['SUBSCRIPTION_ID']
resource_group = azure_settings['RESOURCE_GROUP']

# Azure region passed as the location when the workspace is created.
region = 'westeurope'

### Create a workspace

In [None]:
import azureml.core

# Print the version of the installed azureml.core package.
print(azureml.core.VERSION)

In [None]:
from azureml.core import Workspace

# Reuse the workspace described by ./.azureml/config.json when that file
# exists; otherwise create the workspace and write its configuration file.
workspace_config_path = os.path.join(".", ".azureml", "config.json")

if not os.path.exists(workspace_config_path):
    ws = Workspace.create(
        name='AMLSLearnworkspace',
        subscription_id=subscription_id,
        resource_group=resource_group,
        create_resource_group=True,
        location=region,
    )
    print('AMLS Workspace created')

    # Create the configuration file.
    ws.write_config()
    print('Configuration saved')
else:
    ws = Workspace.from_config()

In [None]:
# View workspace details: the value returned by get_details() is shown as the
# cell output.
ws.get_details()

### Create the experiment

In [None]:
from azureml.core import Experiment

# Create the experiment in the workspace; the run is submitted to it later.
experiment_name = "amls-learn-experiment"
experiment = Experiment(name=experiment_name, workspace=ws)

print('Experiment created')

### Create a remote compute target

In [None]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Step 1: name the cluster and set the minimal and maximal number of nodes.
# Environment variables are always strings, so the node counts are converted
# to int; they then have the same type whether they come from the environment
# or from the defaults below.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 1))
max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 3))
idle_seconds_before_scaledown = 600

# Step 2: choose the VM size (overridable through an environment variable)
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

provisioning_config = AmlCompute.provisioning_configuration(
    vm_size=vm_size,
    min_nodes=min_nodes,
    max_nodes=max_nodes,
    idle_seconds_before_scaledown=idle_seconds_before_scaledown,
)

# create the cluster
compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

print('Compute target created')

### Upload data to AMLS's data store

In [None]:
# Get the workspace's default datastore and upload the local ./data folder to
# the 'data' path on it, overwriting files that are already there.
ds = ws.get_default_datastore()
ds.upload(
    src_dir='./data',
    target_path='data',
    overwrite=True,
    show_progress=True,
)

print('Done')

### Create a modeling script

In [None]:
import os

# Folder that will hold the training script; create it if it is missing.
folder_training_script = './trial_model_mnist'
os.makedirs(name=folder_training_script, exist_ok=True)

print('Done')

In [None]:
%%writefile $folder_training_script/train.py

import argparse
import os
import numpy as np
import glob

from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

from azureml.core import Run
from azureml.core.model import Model
# from utils import load_data

import gzip
import struct


def str2bool(v):
    """Convert a command-line value to a bool.

    Booleans are returned unchanged. Strings are matched case-insensitively
    against the accepted spellings of true and false; anything else raises
    ``argparse.ArgumentTypeError``.
    """
    if isinstance(v, bool):
        return v

    text = v.lower()
    if text in {'yes', 'true', 't', 'y', '1'}:
        return True
    if text in {'no', 'false', 'f', 'n', '0'}:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

        
# load compressed MNIST gz files and return numpy arrays
def load_data(filename, label=False):
    """Load a gzip-compressed MNIST IDX file into a numpy array.

    Args:
        filename: Path to the ``.gz`` archive.
        label: ``True`` to parse a label file, ``False`` (the default) to
            parse an image file.

    Returns:
        A ``uint8`` array of shape ``(n_items, n_rows * n_cols)`` for an
        image file, or ``(n_items, 1)`` for a label file.

    Raises:
        ValueError: If the magic number does not match the requested kind of
            file, or the archive holds fewer bytes than its header announces.
    """
    # MNIST IDX magic numbers: 2051 for image files, 2049 for label files.
    expected_magic = 2049 if label else 2051
    with gzip.open(filename) as gz:
        # All header fields are big-endian unsigned 32-bit integers.
        magic, n_items = struct.unpack('>II', gz.read(8))
        if magic != expected_magic:
            raise ValueError(
                'unexpected magic number %d in %s (expected %d)'
                % (magic, filename, expected_magic))
        if label:
            item_size = 1
        else:
            n_rows, n_cols = struct.unpack('>II', gz.read(8))
            item_size = n_rows * n_cols
        payload = gz.read(n_items * item_size)

    # Fail with a clear message instead of an opaque reshape error.
    if len(payload) != n_items * item_size:
        raise ValueError(
            '%s is truncated: expected %d bytes of data, found %d'
            % (filename, n_items * item_size, len(payload)))
    return np.frombuffer(payload, dtype=np.uint8).reshape(n_items, item_size)


# let user feed in 2 parameters, the dataset to mount or download, 
# and the regularization rate of the logistic regression model
# let the user feed in 3 parameters: the folder where the dataset is mounted
# or downloaded, the regularization rate of the logistic regression model,
# and whether training runs locally (which skips model registration)
parser = argparse.ArgumentParser()
# --data-folder is required: without it the path below cannot be built
parser.add_argument('--data-folder', type=str, dest='data_folder', required=True,
                    help='data folder mounting point')
parser.add_argument('--regularization', type=float, dest='reg', default=0.01, help='regularization rate')
parser.add_argument('--local', type=str2bool, dest='local', default=False,
                    help='Flag indicating where model training takes place')
args = parser.parse_args()

# the MNIST archives are expected under <data-folder>/data/mnist
data_folder = os.path.join(args.data_folder, 'data', 'mnist')
print('Data folder:', data_folder)

# load the train and test set into numpy arrays, scaling pixels to 0-1
X_train = load_data(os.path.join(data_folder, 'train-images.gz'), False) / 255.0
X_test = load_data(os.path.join(data_folder, 'test-images.gz'), False) / 255.0

#print variable set dimension
print(X_train.shape, X_test.shape, sep = '\n')

y_train = load_data(os.path.join(data_folder, 'train-labels.gz'), True).reshape(-1)
y_test = load_data(os.path.join(data_folder, 'test-labels.gz'), True).reshape(-1)

#print the response variable dimension
print( y_train.shape, y_test.shape, sep = '\n')

# get hold of the current run
run = Run.get_context()

# C is the inverse of the regularization strength, hence 1.0 / rate
print('Train a logistic regression model with regularization rate of', args.reg)
clf = LogisticRegression(C=1.0/args.reg, solver="liblinear", multi_class="auto", random_state=42)
clf.fit(X_train, y_train)

print('Predict the test set')
y_hat = clf.predict(X_test)

# calculate accuracy on the prediction
acc = np.average(y_hat == y_test)
print('Accuracy is', acc)

# log plain Python floats; the np.float alias was removed from NumPy
run.log('regularization rate', float(args.reg))
run.log('accuracy', float(acc))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=clf, filename='outputs/sklearn_mnist_model.pkl')

# register an Azure ML model (it's only possible when training in the AMLS)
local = args.local
print(f"Local: {local}, {type(local)}")

if not local:
    print("Registering model...")
    # NOTE(review): the model file is written to 'outputs/sklearn_mnist_model.pkl'
    # above, but model_path here has no 'outputs/' prefix -- confirm against
    # the Run.register_model documentation that this path resolves.
    model = run.register_model(model_name='sklearn_mnist_model.pkl',
                               model_path='sklearn_mnist_model.pkl',
                               tags = {'area': "MNIST", 'type': "sklearn"},
                               description = "identify numbers")

#     model = Model.register(model_path='./outputs', 
#                            model_name='sklearn_mnist_model.pkl',
#                            description = "identify numbers")
    
    print(model.name, model.id, model.version, sep='\t')

#### Run the modeling script locally

> NOTE: Running the script locally first is a quick way to catch errors in it before it is submitted to the remote compute target.

In [None]:
! python ./trial_model_mnist/train.py --data-folder=. --regularization=0.6 --local=True

### Submit the run

In [None]:
# create a configuration for the run

from azureml.train.sklearn import SKLearn

# Arguments handed to train.py: the default datastore as a mount, the
# regularization rate, and the flag saying the run is not local.
script_params = {
    '--data-folder': ds.as_mount(),
    '--regularization': 0.6,
    '--local': False,
}

# Build the scikit-learn estimator that runs train.py from the training
# script folder on the remote compute target.
est = SKLearn(
    source_directory=folder_training_script,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    conda_packages=['scikit-learn'],
)

In [None]:
# Submit the estimator configuration to the experiment and keep the returned run.
run = experiment.submit(config=est)
# Leave the run as the last expression so the notebook displays it.
run

In [None]:
# Get the result: print the metrics recorded for the run (train.py logs
# 'regularization rate' and 'accuracy').
print(run.get_metrics())