In [1]:
import pandas as pd
import numpy as np
import warnings
from math import sqrt
warnings.filterwarnings('ignore')
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.model import Model
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.train.automl import AutoMLConfig
import pickle
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import mlflow
from azureml.core import Dataset

In [2]:
from azureml.core import Workspace, Dataset

subscription_id = '292890d4-aa6d-4d5e-a085-97c80db3c30a'
resource_group = 'MLOpsRG'
workspace_name = 'MLOpsWS'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [3]:
uri = workspace.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(uri)

In [4]:
print(uri)

azureml://australiaeast.api.azureml.ms/mlflow/v1.0/subscriptions/292890d4-aa6d-4d5e-a085-97c80db3c30a/resourceGroups/MLOpsRG/providers/Microsoft.MachineLearningServices/workspaces/MLOpsWS?


In [6]:
# Importing pre-processed dataset
dataset = Dataset.get_by_name(workspace, name='processed_weather_data_portofTurku')
print(dataset.name, dataset.version)

processed_weather_data_portofTurku 1


In [7]:
df = dataset.to_pandas_dataframe()

In [8]:
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


# Spliting Pre-Processed data into Training and Validation datasets

In [9]:
# Validation set is used later to evaluate model performance post training. 

In [10]:
df_training = df.iloc[:77160]

In [11]:
df_training.shape

(77160, 10)

In [12]:
df_validation = df.drop(df_training.index)

In [13]:
df_validation.shape

(19289, 10)

# Registering Training and Validation data to the datastore on the workspace. 

In [14]:
!mkdir Data

mkdir: cannot create directory ‘Data’: File exists


In [15]:
df_training.to_csv('Data/training_data.csv',index=False)

In [16]:
df_validation.to_csv('Data/validation_data.csv',index=False)

In [17]:
datastore = workspace.get_default_datastore()

In [18]:
# Deprecated
# datastore.upload(src_dir='Data', target_path='data')

# from azureml.core import Dataset
# https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/data/
Dataset.File.upload_directory(src_dir="Data", target=(datastore, "data"))

Validating arguments.
Arguments validated.
Uploading file to data
Uploading an estimated of 2 files
Uploading Data/validation_data.csv
Uploaded Data/validation_data.csv, 1 files out of an estimated total of 2
Uploading Data/training_data.csv
Uploaded Data/training_data.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Creating new dataset


{
  "source": [
    "('workspaceblobstore', '/data')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

In [19]:
training_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/training_data.csv'))

In [20]:
validation_dataset = Dataset.Tabular.from_delimited_files(datastore.path('data/validation_data.csv'))

In [21]:
training_ds = training_dataset.register(workspace=workspace,
                                 name='training_dataset',
                                 description='Dataset to use for ML training')

In [22]:
validation_ds = validation_dataset.register(workspace=workspace,
                                 name='validation_dataset',
                                 description='Dataset for validation ML models')

# Data ingestion step - Training dataset

In [23]:
dataset = Dataset.get_by_name(workspace, name='training_dataset')
print(dataset.name, dataset.version)

training_dataset 1


In [24]:
df = dataset.to_pandas_dataframe()

In [25]:
df.head()

Unnamed: 0,Timestamp,Location,Temperature_C,Humidity,Wind_speed_kmph,Wind_bearing_degrees,Visibility_km,Pressure_millibars,Current_weather_condition,Future_weather_condition
0,2006-04-01 02:00:00,"Port of Turku, Finland",8.755556,0.83,11.0446,259,15.8263,1016.51,1,1
1,2006-04-01 03:00:00,"Port of Turku, Finland",9.222222,0.85,13.9587,258,14.9569,1016.66,1,1
2,2006-04-01 04:00:00,"Port of Turku, Finland",7.733333,0.95,12.3648,259,9.982,1016.72,1,1
3,2006-04-01 05:00:00,"Port of Turku, Finland",8.772222,0.89,14.1519,260,9.982,1016.84,1,1
4,2006-04-01 06:00:00,"Port of Turku, Finland",10.822222,0.82,11.3183,259,9.982,1017.37,1,1


In [26]:
df.shape

(77160, 10)

#### Feature Selection and scaling

In [27]:
X = df[['Temperature_C', 'Humidity', 'Wind_speed_kmph', 'Wind_bearing_degrees', 'Visibility_km', 'Pressure_millibars', 'Current_weather_condition']].values
y = df['Future_weather_condition'].values
y

array([1, 1, 1, ..., 1, 1, 1])

In [28]:
# Splitting the Training dataset into Train and Test set for ML training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [29]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [30]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model training and Testing Step

## 1. Support Vector Machine

In [31]:
myexperiment = Experiment(workspace, "support-vector-machine")
mlflow.set_experiment("mlflow-support-vector-machine")

2022/10/10 10:08:05 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-support-vector-machine' does not exist. Creating a new experiment.


<Experiment: artifact_location='', experiment_id='2c836679-a501-4918-bb7d-5ae0055e2f78', lifecycle_stage='active', name='mlflow-support-vector-machine', tags={}>

In [32]:
#from sklearn.svm import SVC
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [33]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

In [34]:
svc = svm.SVC()

In [35]:
# TODO MAHTAB - Did not understand why we need AzureML run and mlflow run both?
# initialize a run in Azureml and mlflow experiments
run = myexperiment.start_logging()
mlflow.start_run()


run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

In [36]:
svc_grid = GridSearchCV(svc, parameters)

In [38]:
%%time
svc_grid.fit(X_train, y_train)

CPU times: user 10min 52s, sys: 588 ms, total: 10min 52s
Wall time: 10min 52s


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [39]:
svc_grid.get_params(deep=True)

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False),
 'iid': 'deprecated',
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'rbf'), 'C': [1, 10]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [40]:
from sklearn.svm import SVC

In [41]:
svc = SVC(C=svc_grid.get_params(deep=True)['estimator__C'], kernel=svc_grid.get_params(deep=True)['estimator__kernel'])

In [42]:
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("C", svc_grid.get_params(deep=True)['estimator__C'])
run.log("Kernel", svc_grid.get_params(deep=True)['estimator__kernel'])

In [44]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [45]:
predicted_svc = svc.predict(X_test)

In [46]:
acc = accuracy_score(y_test, predicted_svc)

In [47]:
fscore = f1_score(y_test, predicted_svc, average="macro")
precision = precision_score(y_test, predicted_svc, average="macro")
recall = recall_score(y_test, predicted_svc, average="macro")

In [48]:
print(fscore)

0.8864428755463128


In [50]:
import git
repo = git.Repo(search_parent_directories=True)
sha = repo.head.object.hexsha

In [51]:
# Log to AzureML and MLflow
run.log("Test_accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)
run.log("F-Score", fscore)
run.log("Git-sha", sha)

In [52]:
run.complete()
print ("run id:", run.id)

run id: 1e6622ce-1534-4ef4-96b9-28f7cd63b63d


In [53]:
mlflow.end_run()

In [54]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset Version': 1,
 'C': 1.0,
 'Kernel': 'rbf',
 'Test_accuracy': 0.9519180922757906,
 'Precision': 0.8869828453699851,
 'Recall': 0.8859050416892464,
 'F-Score': 0.8864428755463128,
 'Git-sha': '80767a16437e8bace0463ae04d942313e1181754'}

In [55]:
workspace.get_details()

{'id': '/subscriptions/292890d4-aa6d-4d5e-a085-97c80db3c30a/resourceGroups/MLOpsRG/providers/Microsoft.MachineLearningServices/workspaces/MLOpsWS',
 'name': 'MLOpsWS',
 'identity': {'principal_id': '39d61691-35f1-4e0a-a314-1252d9cd877e',
  'tenant_id': '03ff20ec-51f3-415c-9462-b61ddcf1ce16',
  'type': 'SystemAssigned'},
 'location': 'australiaeast',
 'type': 'Microsoft.MachineLearningServices/workspaces',
 'tags': {},
 'sku': 'Basic',
 'workspaceid': '31f4ef02-a30d-4a01-993f-1fb82d100b21',
 'sdkTelemetryAppInsightsKey': 'bd22fe07-ca7e-4337-bd98-3d5116261c1f',
 'description': '',
 'friendlyName': 'MLOpsWS',
 'containerRegistry': '/subscriptions/292890d4-aa6d-4d5e-a085-97c80db3c30a/resourceGroups/MLOpsRG/providers/Microsoft.ContainerRegistry/registries/mlopscr1010',
 'keyVault': '/subscriptions/292890d4-aa6d-4d5e-a085-97c80db3c30a/resourceGroups/MLOpsRG/providers/Microsoft.Keyvault/vaults/mlopskv1010',
 'applicationInsights': '/subscriptions/292890d4-aa6d-4d5e-a085-97c80db3c30a/resourceG

In [56]:
import mlflow.sklearn
mlflow.sklearn.log_model(svc, 'outputs')

ModelInfo(artifact_path='outputs', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.5', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '0.22.1', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/aa70964f-610d-4b42-9df7-2d17c02ae562/outputs', model_uuid='f898084a67af4cd284918ff519f16c60', run_id='aa70964f-610d-4b42-9df7-2d17c02ae562', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-10-10 10:35:33.475480', mlflow_version='1.28.0')

In [58]:
# Added this to end the run; it was still running after previous statement
mlflow.end_run()

Random Forest classifier 

In [64]:
myexperiment = Experiment(workspace, "random-forest-classifier")
mlflow.set_experiment("mlflow-random-forest-classifier")

<Experiment: artifact_location='', experiment_id='b65bcfdb-8b0a-49cb-b2f4-2a212edf68c4', lifecycle_stage='active', name='mlflow-random-forest-classifier', tags={}>

In [65]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=100)

In [68]:
# initialize runs in Azureml and mlflow
run = myexperiment.start_logging()
mlflow.start_run()
# mlflow.start_run(nested=True)


# Log dataset used 
run.log("dataset name", dataset.name)
run.log("dataset Version", dataset.version)

In [69]:
%%time
rf.fit(X_train, y_train)

CPU times: user 6.57 s, sys: 12 ms, total: 6.59 s
Wall time: 6.57 s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [70]:
# Logging training parameters to AzureML and MLFlow experiments
run.log("max_depth", 10)
run.log("random_state", 0)
run.log("n_estimators", 100)

In [71]:
predicted_rf = rf.predict(X_test)

In [72]:
acc = accuracy_score(y_test, predicted_rf)
fscore = f1_score(y_test, predicted_rf, average="macro")
precision = precision_score(y_test, predicted_rf, average="macro")
recall = recall_score(y_test, predicted_rf, average="macro")

In [73]:
run.log("Test_accuracy", acc)
run.log("Precision", precision)
run.log("Recall", recall)
run.log("F-Score", fscore)
run.log("Git-sha", sha)

In [74]:
run.complete()
print ("run id:", run.id)

run id: c388a1e1-f9ba-4d84-b054-698440bb1265


In [75]:
mlflow.end_run()

In [76]:
run.get_metrics()

{'dataset name': 'training_dataset',
 'dataset Version': 1,
 'max_depth': 10,
 'random_state': 0,
 'n_estimators': 100,
 'Test_accuracy': 0.9548989113530326,
 'Precision': 0.9018705246237031,
 'Recall': 0.8804084310202218,
 'F-Score': 0.8907272822498857,
 'Git-sha': '9f9c93443b9aa1cc9dbe29702e53b95f4059eab9'}

# Model Packaging Step

pickle file or onnx

In [77]:
# Convert into SVC model into ONNX format file
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 6]))]
onx = convert_sklearn(svc, initial_types=initial_type)
with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.


In [78]:
# Convert into RF model into ONNX format file
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 6]))]
onx = convert_sklearn(rf, initial_types=initial_type)
with open("outputs/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 1.
The maximum opset needed by this model is only 9.


# Model Registering Step

In [79]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/svc.onnx', # this points to a local file 
                       model_name = "support-vector-classifier", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, 'hyparameter-C': '1', 'testdata-accuracy': '0.9519'}, 
                       model_framework='pandas==0.23.4',
                       description = "Support vector classifier to predict weather at port of Turku",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model support-vector-classifier
Name: support-vector-classifier
Version: 1


In [80]:
# Register Model on AzureML WS
model = Model.register(model_path = './outputs/rf.onnx', # this points to a local file 
                       model_name = "random-forest-classifier", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version, 'hyparameter-C': '1', 'testdata-accuracy': '0.9548'}, 
                       model_framework='pandas==0.23.4',
                       description = "Random forest classifier to predict weather at port of Turku",
                       workspace = workspace)

print('Name:', model.name)
print('Version:', model.version)

Registering model random-forest-classifier
Name: random-forest-classifier
Version: 1


In [81]:
import mlflow.sklearn

In [82]:
# Save the model to the outputs directory for capture
mlflow.sklearn.log_model(svc, 'outputs/svc.onnx')

ModelInfo(artifact_path='outputs/svc.onnx', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.5', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '0.22.1', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/14b34d1f-68fa-4df5-a9f6-5129d05e100b/outputs/svc.onnx', model_uuid='54fda8271a80487bb07232ad1e007364', run_id='14b34d1f-68fa-4df5-a9f6-5129d05e100b', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-10-10 01:46:44.569356', mlflow_version='1.28.0')

In [83]:
# Save the model to the outputs directory for capture
mlflow.sklearn.log_model(rf, 'outputs/rf.onnx')

ModelInfo(artifact_path='outputs/rf.onnx', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.5', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '0.22.1', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/14b34d1f-68fa-4df5-a9f6-5129d05e100b/outputs/rf.onnx', model_uuid='db6fff382b974f698f75baec8709b7ca', run_id='14b34d1f-68fa-4df5-a9f6-5129d05e100b', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-10-10 01:46:48.710966', mlflow_version='1.28.0')

# Save model artefacts

In [84]:
import pickle

with open('./outputs/scaler.pkl', 'wb') as scaler_pkl:
    pickle.dump(sc, scaler_pkl)

In [85]:
# Register Model on AzureML WS
scaler = Model.register(model_path = './outputs/scaler.pkl', # this points to a local file 
                       model_name = "scaler", # this is the name the model is registered as
                       tags = {'dataset': dataset.name, 'version': dataset.version}, 
                       model_framework='pandas==0.23.4',
                       description = "Scaler used for scaling incoming inference data",
                       workspace = workspace)

print('Name:', scaler.name)
print('Version:', scaler.version)

Registering model scaler
Name: scaler
Version: 1
