## Importing Libraries

In [None]:
pip install azureml-mlflow azureml-sdk scikit-learn skl2onnx

In [1]:
## core python libraries
from math import sqrt
import pickle

## libraries for data preprocessing
import pandas as pd
import numpy as np

## libraries for data visualization
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

## libraries for project tracking
import mlflow
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace , Dataset
from azureml.core.model import Model 
from azureml.core.authentication import ServicePrincipalAuthentication 
from azureml.train.automl import AutoMLImageConfig 

## libraries for splitting dataset
from sklearn.model_selection import train_test_split

## libraries for scaling the features 
from sklearn.preprocessing import StandardScaler

## libraries for training ml models
from sklearn.svm import SVC
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

## libraries for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

## libraries for computing metrics score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## libraries for model packaging
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

## muting warnings
import warnings 
warnings.filterwarnings('ignore')

## Setting Up MLflow

In [None]:
## Azure ML workspace coordinates (placeholders: replace with your own values)
subscription_id, resource_group, workspace_name = (
    "5d6d4b4a-f629-47f1-a748-1d80f9a6031a",
    "Learn_MLOps",
    "MLOps_WS",
)

In [None]:
## connect to the Azure ML workspace identified by the placeholders above
workspace = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [None]:
## ask the workspace for its MLflow tracking URI, i.e. where MLflow
## experiments and artifacts will be logged
uri = workspace.get_mlflow_tracking_uri()

## point the MLflow client at that tracking URI
mlflow.set_tracking_uri(uri)

In [None]:
## fetch the pre-processed dataset registered in the workspace and
## show which name/version was resolved
dataset = Dataset.get_by_name(workspace=workspace, name="weather_ds_portofTurku")
print(dataset.name, dataset.version)

In [None]:
## load the registered dataset into a pandas DataFrame for local processing
df = dataset.to_pandas_dataframe()

### Setting Up A Validation Framework

In [None]:

## hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=11,
)

## carve 25% of the remaining rows out for validation
## (0.25 * 80% = 20% of all rows, leaving 60% for training)
df_train, df_valid = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=11,
)

## report the size of every split
print(f'Size Of Full Training Dataset: {len(df_train_full)}')
print(f'Size Of Training Dataset: {len(df_train)}')
print(f'Size Of Validation Dataset: {len(df_valid)}')
print(f'Size Of Testing Dataset: {len(df_test)}')

In [None]:

## NOTE(review): this cell repeats the previous cell verbatim. Because both
## splits use random_state=11 it reproduces the same partitions, so it looks
## redundant -- confirm and consider deleting it.

## 80/20 split into full-training and testing rows
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
## 75/25 split of the full-training rows into training and validation rows
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)


## report the size of every split
print(f'Size Of Full Training Dataset: {len(df_train_full)}')
print(f'Size Of Training Dataset: {len(df_train)}')
print(f'Size Of Validation Dataset: {len(df_valid)}')
print(f'Size Of Testing Dataset: {len(df_test)}')

In [None]:
### Persist every split as a CSV file (without the index column)

## NOTE(review): the 'trianing_data.csv' spelling is kept unchanged because
## other consumers may depend on the existing file name -- confirm before renaming.
split_outputs = [
    (df_train_full, '../dataset/full_training_data.csv'),
    (df_train, '../dataset/trianing_data.csv'),
    (df_valid, '../dataset/validation_data.csv'),
    (df_test, '../dataset/testing.csv'),
]

for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)

In [None]:
## get the workspace's default datastore; the prepared CSV files are uploaded to it next
datastore = workspace.get_default_datastore()

In [None]:
## upload the contents of the local '../dataset/' directory (src_dir) to the
## 'data' path (target_path) in the datastore
datastore.upload(src_dir='../dataset/', target_path='data')

### Feature Selection 

In [None]:
## pull the target column out of every split as an array, in the order
## full-training, training, validation, testing
y_train_full, y_train, y_valid, y_test = (
    split_frame['Future_weather_condition'].values
    for split_frame in (df_train_full, df_train, df_valid, df_test)
)

In [None]:
## remove the target column from every split so it cannot leak into the features
## (this mutates the frames in place, so the cell can only be run once per kernel)
for split_frame in (df_train_full, df_train, df_valid, df_test):
    del split_frame['Future_weather_condition']

In [None]:
## feature columns fed to the models (seven in total)
columns = [
    'Temperature_C',
    'Humidity',
    'Wind_speed_kmph',
    'Wind_bearing_degrees',
    'Visibility_km',
    'Pressure_millibars',
    'Current_weather_condition',
]

In [None]:
## select the feature columns of every split and convert them to arrays
X_train_full, X_train, X_test, X_valid = (
    split_frame[columns].values
    for split_frame in (df_train_full, df_train, df_test, df_valid)
)

### Feature Scaling And Standardization

In [None]:
## fit a standard scaler on the full training split ...
## NOTE(review): df_train_full contains the validation rows, so the scaler
## statistics include them -- confirm this is intended.
sc = StandardScaler()
sc.fit(X_train_full)

## ... and apply the same fitted transformation to every split
X_train_full = sc.transform(X_train_full)
X_train = sc.transform(X_train)
X_valid = sc.transform(X_valid)
X_test = sc.transform(X_test)

### Model Training And Hyperparameter Optimization

In [None]:
## Azure ML experiment under which the SVM runs are recorded
myexperiment = Experiment(workspace=workspace, name="support-vector-machine")

## select the matching MLflow experiment
mlflow.set_experiment("mlflow-support-vector-machine")

#### Support Vector Machine 

In [None]:
## hyperparameter grid explored by the search: two kernels x two values of C
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}

## start an Azure ML run for this experiment (the MLflow run start is left disabled)
run = myexperiment.start_logging()

#mlflow.start_run()

## record which dataset the run was trained on
run.log('dataset name', dataset.name)
run.log('dataset version', dataset.version)

## base estimator whose hyperparameters are searched
svc = svm.SVC()

## exhaustive grid search over `parameters` on the training split
svc_grid = GridSearchCV(estimator=svc, param_grid=parameters)
svc_grid.fit(X_train, y_train)

In [None]:
## hyperparameters selected by the grid search.
## `best_params_` holds the winning combination; `get_params()['estimator__*']`
## only returns the settings of the unfitted base estimator, i.e. its defaults.
best_params = svc_grid.best_params_

## train the final SVC on the training split with the selected hyperparameters
svc = SVC(C=best_params['C'], kernel=best_params['kernel'])
svc.fit(X_train, y_train)

## log the selected hyperparameters to the Azure ML run
run.log("C", best_params['C'])
run.log("Kernel", best_params['kernel'])

### Random Forest Classifier

In [None]:
## Azure ML experiment under which the random forest runs are recorded
myexperiment = Experiment(workspace=workspace, name="random-forest-classifier")

## select the matching MLflow experiment
mlflow.set_experiment("mlflow-random-forest-classifier")

In [None]:

## random forest with 100 trees of depth <= 10; the seed is fixed for reproducibility
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)

In [None]:
## start an Azure ML run for this experiment (the MLflow run start is left disabled)
run = myexperiment.start_logging()

#mlflow.start_run()

## record which dataset the run was trained on
run.log('dataset name', dataset.name)
run.log('dataset Version', dataset.version)

## fit the random forest on the training split
rf.fit(X_train, y_train)

## log the training hyperparameters, read from the estimator itself so the
## logged values cannot drift from the ones the model was built with
run.log('max_depth', rf.max_depth)
run.log('random_state', rf.random_state)
run.log('n_estimators', rf.n_estimators)

### Model Evaluation And Testing


#### Support Vector Machine 

In [None]:
## predict the validation split with the SVM classifier
y_valid_pred = svc.predict(X_valid)

## accuracy plus macro-averaged F1, precision and recall of those predictions
accScore = accuracy_score(y_true=y_valid, y_pred=y_valid_pred)
f1Score = f1_score(y_true=y_valid, y_pred=y_valid_pred, average="macro")
precisionScore = precision_score(y_true=y_valid, y_pred=y_valid_pred, average="macro")
recallScore = recall_score(y_true=y_valid, y_pred=y_valid_pred, average="macro")

## NOTE(review): when the notebook is executed top to bottom, `run` has already
## been reassigned by the random forest training cell, so these SVM metrics are
## logged to that run -- confirm whether a dedicated SVM run handle is intended.
for metric_name, metric_value in (
    ("Validation Accuracy Score:", accScore),
    ("Validation f1 Score:", f1Score),
    ("Validation precision Score:", precisionScore),
    ("Validation Recall Score:", recallScore),
):
    run.log(metric_name, metric_value)
#run.log("Git-sha", sha)

#### Random Forest Classifier

In [None]:
## predict the validation split with the random forest classifier
y_valid_pred = rf.predict(X_valid)

## accuracy plus macro-averaged F1, precision and recall of those predictions
accScore = accuracy_score(y_true=y_valid, y_pred=y_valid_pred)
f1Score = f1_score(y_true=y_valid, y_pred=y_valid_pred, average="macro")
precisionScore = precision_score(y_true=y_valid, y_pred=y_valid_pred, average="macro")
recallScore = recall_score(y_true=y_valid, y_pred=y_valid_pred, average="macro")

## log the validation metrics to the current Azure ML run
for metric_name, metric_value in (
    ("Validation Accuracy Score:", accScore),
    ("Validation f1 Score:", f1Score),
    ("Validation precision Score:", precisionScore),
    ("Validation Recall Score:", recallScore),
):
    run.log(metric_name, metric_value)
#run.log("Git-sha", sha)

### Model Packaging

In [None]:
## declare the ONNX input as a float tensor with one column per training feature.
## The width is taken from the training matrix so it always matches the model
## (the feature list has 7 columns; a hard-coded 6 would not match it).
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

## convert the fitted SVM classifier to ONNX
onx = convert_sklearn(svc, initial_types=initial_type)

## write the serialised ONNX model to disk
with open("outputs/svc.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [None]:
## declare the ONNX input as a float tensor with one column per training feature.
## The width is taken from the training matrix so it always matches the model
## (the feature list has 7 columns; a hard-coded 6 would not match it).
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

## convert the fitted random forest classifier to ONNX
onx = convert_sklearn(rf, initial_types=initial_type)

## write the serialised ONNX model to disk
with open("outputs/rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())

#### Registering Models And Production Artifacts

In [None]:
## validation accuracy of the model being registered, computed here so the
## tag always reflects the actual classifier instead of a stale literal
svc_valid_accuracy = accuracy_score(y_valid, svc.predict(X_valid))

## register the ONNX-packaged SVM classifier in the workspace model registry
model = Model.register(model_path="outputs/svc.onnx",
                       model_name="Support-vector-classifier",
                       tags={'dataset': dataset.name,
                             'version': dataset.version,
                             'hyparameter-C': str(svc.C),
                             'valid_data-accuracy': f'{svc_valid_accuracy:.4f}'},
                       # the artifact is an ONNX file, so declare the ONNX framework
                       model_framework=Model.Framework.ONNX,
                       description="Support Vector Classifier for predicting weather at port of Turku",
                       workspace=workspace)
print('Name:', model.name)
print('Version:', model.version)

In [None]:
## validation accuracy of the model being registered, computed here so the
## tag always reflects the actual classifier instead of a stale literal
rf_valid_accuracy = accuracy_score(y_valid, rf.predict(X_valid))

## register the ONNX-packaged random forest classifier in the workspace model registry;
## the tags carry the forest's own hyperparameters (it has no `C` parameter)
model = Model.register(model_path="outputs/rf.onnx",
                       model_name="random-forest-classifier",
                       tags={'dataset': dataset.name,
                             'version': dataset.version,
                             'max_depth': str(rf.max_depth),
                             'n_estimators': str(rf.n_estimators),
                             'valid_data-accuracy': f'{rf_valid_accuracy:.4f}'},
                       # the artifact is an ONNX file, so declare the ONNX framework
                       model_framework=Model.Framework.ONNX,
                       description="Random Forest Classifier for predicting weather at port of Turku",
                       workspace=workspace)

print('Name:', model.name)
print('Version:', model.version)

#### Registering Production Artifacts

In [None]:
## serialise the fitted scaler so inference can apply the same transformation
## (`pickle` is already imported in the import cell at the top of the notebook)
with open("./outputs/scaler.pkl", "wb") as scaler_pkl:
    pickle.dump(sc, scaler_pkl)

In [None]:
## register the pickled scaler as a production artifact in the workspace model registry
model = Model.register(model_path="outputs/scaler.pkl",
                       model_name="scaler",
                       tags={'dataset': dataset.name, 'version': dataset.version},
                       # a pickled pre-processing object, not a predictive model:
                       # declare it as a custom artifact
                       model_framework=Model.Framework.CUSTOM,
                       description="Scaler used for scaling incoming inference data",
                       workspace=workspace)

print('Name:', model.name)
print('Version:', model.version)