In [None]:
!pip install --upgrade azureml-core

In [None]:
!pip install --upgrade scikit-learn==0.24.2

In [None]:
!pip install --upgrade imbalanced-learn==0.8.0

In [None]:
!pip install --upgrade seaborn

In [None]:
!pip install --upgrade azureml-interpret

In [None]:
!pip install interpret-community[visualization]

In [None]:
!pip install --upgrade azureml-mlflow

In [1]:
import azureml.core


# Report the version of the Azure ML SDK loaded in this kernel.
print("Azure ML SDK Version: {}".format(azureml.core.VERSION))

Azure ML SDK Version: 1.30.0


In [2]:
from azureml.core import Workspace


# Connect to the Azure ML workspace resolved by Workspace.from_config();
# `ws` is the handle reused by every later cell (experiment, compute, deployment).
ws = Workspace.from_config()
ws

Workspace.create(name='sandboxaml', subscription_id='f80606e5-788f-4dc3-a9ea-2eb9a7836082', resource_group='rg-sandbox')

In [3]:
import sklearn


# Report the local scikit-learn version (the label previously claimed this was
# the Azure ML SDK version, which made the output misleading).
print(f"scikit-learn version: {sklearn.__version__}")

Azure ML SDK Version: 0.24.2


In [4]:
# Name of the Azure ML experiment under which this notebook's runs are grouped.
experiment_name = 'exp_german_credit'

In [5]:
from azureml.core import Experiment


# Experiment handle in workspace `ws`; the training run is submitted through it later.
exp = Experiment(workspace=ws, name=experiment_name)
exp

Name,Workspace,Report Page,Docs Page
exp_german_credit,sandboxaml,Link to Azure Machine Learning studio,Link to Documentation


In [6]:
# Name of the compute target that is looked up (or created) below and used for the remote run.
compute_target_name = "compute-cluster"

In [7]:
# Compute target creation: reuse the cluster named `compute_target_name` when it
# exists, otherwise provision a new AmlCompute cluster under that same name.

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException


# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=compute_target_name)
    print(" Cluster already exists")
except ComputeTargetException:
    # min_nodes=0 lets the cluster scale down to zero nodes when idle.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           min_nodes=0, max_nodes=4)
    # Create the cluster under the name the rest of the notebook looks up
    # (the previous code referenced an undefined `cpu_cluster_name`).
    cpu_cluster = ComputeTarget.create(ws, compute_target_name, compute_config)

# Block until provisioning has finished (or the 30-minute timeout expires).
cpu_cluster.wait_for_completion(show_output=True, min_node_count=0, timeout_in_minutes=30)

 Cluster already exists
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [8]:
# Retrieve existing compute target

from azureml.core.compute import ComputeTarget


# Look the target up by name and print its provisioning state as a sanity check.
compute_target = ComputeTarget(workspace=ws, name=compute_target_name)
print(compute_target.provisioning_state)

Succeeded


# Remote training

In [None]:
%%writefile scripts/train.py

import argparse
import os
import numpy as np
import glob

import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix,  plot_roc_curve, f1_score, recall_score
from sklearn import preprocessing
#from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
import joblib
#import pickle

from azureml.core import Run
from azureml.core import Dataset
#from utils import load_data

# get hold of the current run
run = Run.get_context()
exp = run.experiment
ws = run.experiment.workspace

parser = argparse.ArgumentParser()
parser.add_argument('--penalty', type=str, default='l2', help='norm')
parser.add_argument('--max_iter', type=int, default=10000, help='iterations')
args = parser.parse_args()

print("Argument 1: %s" % args.penalty)
print("Argument 2: %s" % args.max_iter)

dataset_name = 'German Credit'
dataset = Dataset.get_by_name(ws, name=dataset_name)
df = dataset.to_pandas_dataframe()

#df.drop("id", axis=1, inplace=True) # d'où vient la variable id ? du dataset Azure ML ?
df["class"] = [1 if x == "good" else 0 for x in df["class"]]

credit_train, credit_test = train_test_split(df, test_size=0.2)
Y_train = credit_train["class"]
credit_train.drop("class", axis=1, inplace=True)

Y_test = credit_test["class"]
credit_test.drop("class", axis=1, inplace=True)

le = preprocessing.LabelEncoder()
X_train = pd.DataFrame()
X_test = pd.DataFrame()

for c in credit_train:
    X_train[c] = le.fit_transform(credit_train[c])
    X_test[c] = le.fit_transform(credit_test[c])
    
# Basic logistic regression
model = LogisticRegression(penalty='l2', max_iter=10000).fit(X=X_train, y=Y_train)
print("Train accuracy : {}".format(model.score(X_train, Y_train)))
print("Test accuracy : {}".format(model.score(X_test, Y_test)))

plot_confusion_matrix(model, X_test, Y_test, cmap="binary")

#plot_roc_curve(model, X_test, Y_test)
#plt.show()

print("Recall : {}".format(recall_score(model.predict(X_test), Y_test)))
print("F1 : {}".format(f1_score(model.predict(X_test), Y_test)))

os.makedirs('outputs', exist_ok=True)
# note file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=model, filename='outputs/german_credit_log_remote_model.pkl')

In [9]:
%%writefile scripts/train.py

import argparse
import os
import numpy as np
import glob

import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix,  plot_roc_curve, f1_score, recall_score
from sklearn import preprocessing
#from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
import joblib
#import pickle

import mlflow
import mlflow.sklearn

from azureml.core import Run
from azureml.core import Dataset
#from utils import load_data

# get hold of the current run
run = Run.get_context()
exp = run.experiment
ws = run.experiment.workspace

parser = argparse.ArgumentParser()
parser.add_argument('--penalty', type=str, default='l2', help='norm')
parser.add_argument('--max_iter', type=int, default=10000, help='iterations')
args = parser.parse_args()

print("Argument 1: %s" % args.penalty)
print("Argument 2: %s" % args.max_iter)

dataset_name = 'German Credit'
dataset = Dataset.get_by_name(ws, name=dataset_name)
df = dataset.to_pandas_dataframe()

#df.drop("id", axis=1, inplace=True) # d'où vient la variable id ? du dataset Azure ML ?
df["class"] = [1 if x == "good" else 0 for x in df["class"]]

credit_train, credit_test = train_test_split(df, test_size=0.2)
Y_train = credit_train["class"]
credit_train.drop("class", axis=1, inplace=True)

Y_test = credit_test["class"]
credit_test.drop("class", axis=1, inplace=True)

le = preprocessing.LabelEncoder()
X_train = pd.DataFrame()
X_test = pd.DataFrame()

for c in credit_train:
    X_train[c] = le.fit_transform(credit_train[c])
    X_test[c] = le.fit_transform(credit_test[c])

mlflow_uri = ws.get_mlflow_tracking_uri()
print(f"MLflow URI : {mlflow_uri}")
mlflow.set_tracking_uri(mlflow_uri)
mlflow.set_experiment(exp.name)

with mlflow.start_run():
    
    mlflow.log_param("penalty", args.penalty)
    mlflow.log_param("max_iter", args.max_iter)

    model = LogisticRegression(penalty='l2', max_iter=10000).fit(X=X_train, y=Y_train)
    
    train_accuracy = model.score(X_train, Y_train)
    test_accuracy = model.score(X_test, Y_test)
    mlflow.log_metric("train accuracy", train_accuracy)
    mlflow.log_metric("test accuracy", test_accuracy)
    print("Train accuracy : {}".format(train_accuracy))
    print("Test accuracy : {}".format(test_accuracy))

    cm_image = plot_confusion_matrix(model, X_test, Y_test, cmap="binary")
    rc_image = plot_roc_curve(model, X_test, Y_test)
    
    # Log artifacts (output files)
    #mlflow.log_artifact("confusion_matrix.png")
    #mlflow.log_artifact("roc_curve.png")

    recall = recall_score(model.predict(X_test), Y_test)
    f1 = f1_score(model.predict(X_test), Y_test)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    print("Recall : {}".format(recall))
    print("F1 : {}".format(f1))
    
    # NE CREE PAS D'ARTEFACT ? il faudrait utiliser model.register ou run.register_model
    mlflow.sklearn.log_model(model, "german_credit_mlflow_log_model")
    
    # ENREGISTRE LE MODELE EN LOCAL
    modelpath = f"german_credit/log_model-{args.penalty}-{args.max_iter}.pkl"
    mlflow.sklearn.save_model(model, modelpath)
    
    azureml_model = run.register_model(model_name='german_credit_mlflow_log_model',
                                              model_path="german_credit_mlflow_log_model/model.pkl",
                                              tags={},
                                              description="Model saved with MLflow sklearn")

Overwriting scripts/train.py


https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/media/how-to-track-experiments/mlflow-diagram-track.png

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies


# Training environment defined programmatically from a list of pip packages.
trainenv = Environment('german-credit-training-env')
trainenv.python.conda_dependencies = CondaDependencies.create(pip_packages=[
    'azureml-defaults',
    # train.py calls ws.get_mlflow_tracking_uri(), which is provided by azureml-mlflow.
    'azureml-mlflow',
    'inference-schema[numpy-support]',
    'joblib',
    'numpy',
    'pandas',
    # The PyPI distribution is 'scikit-learn'; 'sklearn' is only a deprecated
    # placeholder. Pinned to the version installed at the top of this notebook.
    'scikit-learn==0.24.2',
    'mlflow',
    'matplotlib',
    'seaborn'
])

# https://azure.github.io/azureml-cheatsheets/docs/cheatsheets/python/v1/environment/
# add pip packages
#conda.add_pip_package('pickle')
#conda.add_pip_package('collections')

# Persist the environment definition in the given directory.
trainenv.save_to_directory('environment/trainenv.yml', overwrite=True)



Conda channels : https://docs.anaconda.com/anaconda/user-guide/tasks/using-repositories/


In [10]:
%%writefile scripts/trainenv.yml

# Conda specification of the remote training environment (loaded with
# Environment.from_conda_specification in the next cell).
name: trainenv
channels:
    - anaconda
    - conda-forge
dependencies:
    - python=3.6.9
    - pip
    - pip:
        - azureml-core
        - azureml-defaults
        - azureml-mlflow
        - opencensus-ext-azure>=1.0.1
        - inference-schema[numpy-support]
        - joblib
        - numpy
        - pandas
        # Same scikit-learn version as the one pip-installed at the top of the
        # notebook, so models are trained and loaded with matching versions.
        - scikit-learn==0.24.2
        - imbalanced-learn==0.8.0
        - mlflow
        - matplotlib
        - seaborn

Overwriting scripts/trainenv.yml


In [11]:
from azureml.core import Environment


# Build the training environment from the conda file written by the previous cell.
trainenv = Environment.from_conda_specification('trainenv', 'scripts/trainenv.yml')

In [12]:
# Folder containing the files written with %%writefile; passed as source_directory of the run.
script_folder = 'scripts'

In [13]:
from azureml.train.estimator import Estimator


# Command-line arguments forwarded to train.py.
script_params = {'--penalty': 'l2', '--max_iter': 10000}

# Estimator is deprecated (the SDK emits a warning); the run is actually
# submitted through the ScriptRunConfig defined in the next cell.
estimator = Estimator(
    source_directory=script_folder,
    entry_script='train.py',
    script_params=script_params,
    environment_definition=trainenv,
    compute_target=compute_target_name,
)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


In [14]:
# ScriptRunConfig: the replacement for the deprecated Estimator.
# https://docs.microsoft.com/fr-fr/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py
# https://docs.microsoft.com/fr-fr/azure/machine-learning/how-to-migrate-from-estimators-to-scriptrunconfig
from azureml.core import ScriptRunConfig


# Run train.py from `script_folder` on the named compute target, inside the
# `trainenv` environment, with the hyper-parameters given as CLI arguments.
config = ScriptRunConfig(
    source_directory=script_folder,
    script='train.py',
    arguments=['--penalty', 'l2', '--max_iter', 10000],
    environment=trainenv,
    compute_target=compute_target_name,
)

In [15]:
# Submit the ScriptRunConfig to the experiment; `run` is the handle used below
# to monitor the remote execution. The line below is the deprecated Estimator variant.
#run = exp.submit(config=estimator)
run = exp.submit(config=config)
run

Experiment,Id,Type,Status,Details Page,Docs Page
exp_german_credit,exp_german_credit_1623184873_8799d9fa,azureml.scriptrun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [16]:
from azureml.widgets import RunDetails

# Display the notebook widget that follows the submitted run.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [17]:
# Block until the remote run has finished.
# specify show_output to True for a verbose log
run.wait_for_completion(show_output=True)

RunId: exp_german_credit_1623184873_8799d9fa
Web View: https://ml.azure.com/runs/exp_german_credit_1623184873_8799d9fa?wsid=/subscriptions/f80606e5-788f-4dc3-a9ea-2eb9a7836082/resourcegroups/rg-sandbox/workspaces/sandboxaml&tid=8e2e7c2d-4702-496d-af6c-96e4bfc9f667

Streaming azureml-logs/65_job_prep-tvmps_5fc14c69b009f20fa8e89b42d0d27c21057a0a892d64ddba758ffa270043cc95_d.txt

[2021-06-08T20:45:50.194346] Entering job preparation.
[2021-06-08T20:45:51.619326] Starting job preparation.
[2021-06-08T20:45:51.619380] Extracting the control code.
[2021-06-08T20:45:51.640339] fetching and extracting the control code on master node.
[2021-06-08T20:45:51.640375] Starting extract_project.
[2021-06-08T20:45:51.640418] Starting to extract zip file.
[2021-06-08T20:45:52.166462] Finished extracting zip file.
[2021-06-08T20:45:52.346078] Using urllib.request Python 3.0 or later
[2021-06-08T20:45:52.346151] Start fetching snapshots.
[2021-06-08T20:45:52.346199] Start fetching snapshot.
[2021-06-08T20:

{'runId': 'exp_german_credit_1623184873_8799d9fa',
 'target': 'compute-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-06-08T20:45:24.171666Z',
 'endTimeUtc': '2021-06-08T20:46:34.117923Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'fd9f762a-7f59-4074-aff6-df2d87166183',
  'azureml.git.repository_uri': 'methodidacte@vs-ssh.visualstudio.com:v3/methodidacte/azuremlnotebooks/azuremlnotebooks',
  'mlflow.source.git.repoURL': 'methodidacte@vs-ssh.visualstudio.com:v3/methodidacte/azuremlnotebooks/azuremlnotebooks',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '175da9d421962e1b3971a618aea8a1f856c3f8c5',
  'mlflow.source.git.commit': '175da9d421962e1b3971a618aea8a1f856c3f8c5',
  'azureml.git.dirty': 'True',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json',
  'mlflow.param.key.penalty': 'l2',
  'mlflow.param.key.max_iter': '10000'

Warning: you have pip-installed dependencies in your environment file, but you do not list pip itself as one of your conda dependencies.  Conda may not use the correct pip to install your packages, and they may end up in the wrong place.  Please add an explicit pip dependency.  I'm adding one for you, but still nagging you.

==> WARNING: A newer version of conda exists. <==
  current version: 4.9.2
  latest version: 4.10.1

Please update conda by running

    $ conda update -n base -c defaults conda

In [None]:
# List the names of the files recorded with the run.
print(run.get_file_names())

https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/media/how-to-track-experiments/mlflow-diagram-track.png

# Inference

In [None]:
%%writefile scripts/score.py

import joblib
import numpy as np
import os
import json
import sklearn

from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType


# The init() method is called once, when the web service starts up.
#
# Typically you would deserialize the model file, as shown here using joblib,
# and store it in a global variable so your run() method can access it later.
def init():
    global model

    # The AZUREML_MODEL_DIR environment variable indicates
    # a directory containing the model file you registered.
    model_filename = 'german_credit_log_model.pkl'
    model_path = os.path.join(os.environ['AZUREML_MODEL_DIR'], model_filename)
    #ModuleNotFoundError: No module named 'sklearn.externals.joblib'
    model = joblib.load(model_path)


# The run() method is called each time a request is made to the scoring API.
#
# Shown here are the optional input_schema and output_schema decorators
# from the inference-schema pip package. Using these decorators on your
# run() method parses and validates the incoming payload against
# the example input you provide here. This will also generate a Swagger
# API document for your web service.
#@input_schema('data', NumpyParameterType(np.array([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])))
#@output_schema(NumpyParameterType(np.array([0])))

def run(raw_data):
    data = json.loads(raw_data)['data']
    method = json.loads(raw_data)['method']
    # Use the model object loaded by init().
    result = model.predict(data) if method=="predict" else model.predict_proba(data)

    # You can return any JSON-serializable object.
    return result.tolist()

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies


# Environment used by the deployed scoring service.
environment = Environment('german-credit-deploy-env')
environment.python.conda_dependencies = CondaDependencies.create(pip_packages=[
    'azureml-defaults',
    'inference-schema[numpy-support]',
    'joblib',
    'numpy',
    # The PyPI distribution is 'scikit-learn'; 'sklearn' is only a deprecated
    # placeholder. Pinned to the version installed at the top of this notebook
    # so the pickled model is loaded with a matching library version.
    'scikit-learn==0.24.2'
])

# Persist the environment definition in the given directory.
environment.save_to_directory('environment/infenv.yml', overwrite=True)

In [None]:
%%writefile scripts/infenv.yml

# Conda specification of the inference (scoring) environment.
name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
  # pip is listed explicitly so conda installs the pip section below with the
  # environment's own pip (conda warns when it is missing).
- pip

- pip:
  - azureml-defaults
  - inference-schema[numpy-support]
  - joblib
  - numpy
  # 'sklearn' is a deprecated placeholder on PyPI; the real package is scikit-learn.
  - scikit-learn==0.24.2
channels:
- anaconda
- conda-forge


In [None]:
from azureml.core.model import InferenceConfig


# score.py is written to the `script_folder` directory by the %%writefile cell,
# so that folder is given as source_directory and entry_script is resolved
# relative to it (a bare 'score.py' would be looked up in the notebook folder).
inference_config = InferenceConfig(source_directory=script_folder,
                                   entry_script='score.py',
                                   environment=environment)

In [None]:
# Name under which the web service is deployed (and looked up for deletion).
service_name = 'german-credit-custom-srv'

In [None]:
from azureml.core import Webservice
from azureml.core.model import Model
from azureml.core.webservice import AciWebservice
from azureml.exceptions import WebserviceException


# Registered model to deploy: the name is the one train.py passes to
# run.register_model(). `Model` and `model` were previously never defined in
# the notebook, so this cell failed with a NameError on a fresh kernel.
model = Model(ws, name='german_credit_mlflow_log_model')

# Remove any existing service under the same name.
try:
    Webservice(ws, service_name).delete()
except WebserviceException:
    # No service with that name yet: nothing to delete.
    pass

# Small ACI container without key authentication.
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1, auth_enabled=False)

service = Model.deploy(workspace=ws,
                       name=service_name,
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=aci_config)

# Block until the deployment has finished, streaming its progress.
service.wait_for_deployment(show_output=True)

In [None]:
# Show the current state of the deployed service.
print(service.state)

In [None]:
# Print the service logs (useful when the deployment or a scoring call fails).
print(service.get_logs())

In [None]:
# Only when auth_enabled=True
#print(service.get_keys())

In [None]:
# Print the URI of the service's Swagger document.
print(service.swagger_uri)

In [None]:
import json


# Request body understood by score.py: one row of 20 zero-valued features and
# the method to call ("predict" returns the class, "predict_proba" the probabilities).
# Example of a raw (non-encoded) record, kept for reference:
#   ['0<=X<200',24,'existing paid','radio/tv',5951,'<100','1<=X<4',2,'female div/dep/mar','none',2,'real estate',22,'none','own',1,'skilled',1,'none','yes']
input_payload = json.dumps({"data": [[0] * 20], "method": "predict"})

# Call the service through the SDK and show its answer.
output = service.run(input_payload)
print(output)


In [None]:
import json


# Same request as the "predict" call, but asking for class probabilities.
# Example of a raw (non-encoded) record, kept for reference:
#   ['0<=X<200',24,'existing paid','radio/tv',5951,'<100','1<=X<4',2,'female div/dep/mar','none',2,'real estate',22,'none','own',1,'skilled',1,'none','yes']
input_payload = json.dumps({"data": [[0] * 20], "method": "predict_proba"})

# Call the service through the SDK and show its answer.
output = service.run(input_payload)
print(output)


In [None]:
import json

import requests


# Request body: one row of 20 zero-valued features plus the "method" entry that
# the scoring script reads (it was missing from the hand-written JSON string).
#input_data = "{\"data\": [[\"0<=X<200\",48,\"existing paid\",\"radio/tv\",5951,\"<100\",\"1<=X<4\",2,\"female div/dep/mar\",\"none\",2,\"real estate\",22,\"none\",\"own\",1,\"skilled\",1,\"none\",\"yes\"]]}"
input_data = json.dumps({"data": [[0] * 20], "method": "predict"})

headers = {'Content-Type':'application/json'}

# The service key is only needed (and only retrievable) when key authentication
# is enabled, e.g. for an AKS deployment. This notebook deploys the ACI service
# with auth_enabled=False, so get_keys() must not be called unconditionally.
if service.auth_enabled:
    api_key = service.get_keys()[0]
    headers['Authorization'] = 'Bearer ' + api_key

resp = requests.post(service.scoring_uri, data=input_data, headers=headers)

print("POST to url", service.scoring_uri)
print("prediction:", resp.text)

In [None]:
#service.delete()

# Interpretability

https://github.com/MicrosoftDocs/azure-docs/blob/master/articles/machine-learning/how-to-machine-learning-interpretability-aml.md

#### remote run

In [18]:
# Importance of the engineered features, downloaded from the best run.
# pandas is imported here because no earlier notebook cell imports it
# (it is only imported inside the train.py script).
import pandas as pd
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(run)
engineered_explanations = client.download_model_explanation(raw=False)
#print(engineered_explanations.get_feature_importance_dict())

# One row per engineered feature with its importance value.
engineered_dict = engineered_explanations.get_feature_importance_dict()
engineered_df = pd.DataFrame(list(engineered_dict.items()), columns=['Engineered_feature', 'Value'])

ImportError: cannot import name 'ExplanationClient'

In [None]:
# Preview the first 20 engineered features and their importance values.
engineered_df.head(20)

In [None]:
# Importance of the raw features, downloaded from the best run.
# pandas is imported here because no earlier notebook cell imports it
# (it is only imported inside the train.py script).
import pandas as pd
from azureml.interpret import ExplanationClient

client = ExplanationClient.from_run(run)
raw_explanations = client.download_model_explanation(raw=True)

# Compute the importance dictionary once and reuse it for display and tabulation.
raw_dict = raw_explanations.get_feature_importance_dict()
print(raw_dict)

# One row per raw feature with its importance value.
raw_df = pd.DataFrame(list(raw_dict.items()), columns=['Raw_feature', 'Value'])

#Explanation asset ID None was not found to match the supplied filters ['comment', 'raw'].

In [None]:
# Keep only the raw features whose importance value is strictly positive.
raw_df[raw_df['Value']>0]

In [None]:
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations

# NOTE(review): train_data, target, fitted_model and X_test are not defined in
# the cells shown in this notebook; they presumably come from an AutoML
# notebook this section was adapted from. Confirm before running.

# Materialise the training dataset once and split it into features and target.
train_df = train_data.to_pandas_dataframe()
X_train = train_df.drop(target, axis=1)
y_train = train_df[[target]]

# `y` must be the labels that belong to `X` (the training features); the
# previous code passed y_test and left y_train unused.
automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model, X=X_train, 
                                                             X_test=X_test.drop([target, 'score', 'gap', 'gap w/sign'], axis=1), y=y_train,
                                                             task='classification')

In [None]:
from azureml.interpret import MimicWrapper

# Initialize the Mimic Explainer.
# The explanation setup object is built with task='classification', so the
# class labels are passed and the task is no longer declared as 'regression'.
explainer = MimicWrapper(ws, automl_explainer_setup_obj.automl_estimator,
                         explainable_model=automl_explainer_setup_obj.surrogate_model, 
                         init_dataset=automl_explainer_setup_obj.X_transform, run=automl_run,
                         features=automl_explainer_setup_obj.engineered_feature_names, 
                         feature_maps=[automl_explainer_setup_obj.feature_map],
                         model_task='classification',
                         classes=automl_explainer_setup_obj.classes,
                         explainer_kwargs=automl_explainer_setup_obj.surrogate_model_params)

In [None]:
# Compute local and global explanations on the transformed test set and
# print the resulting feature-importance dictionary.
engineered_explanations = explainer.explain(['local', 'global'], eval_dataset=automl_explainer_setup_obj.X_test_transform)
print(engineered_explanations.get_feature_importance_dict())

In [None]:
from interpret_community.widget import ExplanationDashboard

# Open the dashboard on the raw explanations, using the AutoML pipeline as the
# model and the raw test set as the dataset.
ExplanationDashboard(raw_explanations, automl_explainer_setup_obj.automl_pipeline, datasetX=automl_explainer_setup_obj.X_test_raw)

In [None]:
from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer, save

# Build a scoring-time explainer from the mimic explainer, save it locally and
# register both the AutoML model and the explainer from the AutoML run.
# NOTE(review): `automl_run` is not defined in the cells shown in this notebook — confirm its origin.

# Initialize the ScoringExplainer
scoring_explainer = TreeScoringExplainer(explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map])

# Pickle scoring explainer locally
save(scoring_explainer, exist_ok=True)

# Register trained automl model present in the 'outputs' folder in the artifacts
original_model = automl_run.register_model(model_name='automl_model', 
                                           model_path='outputs/model.pkl')

# Register scoring explainer
automl_run.upload_file('scoring_explainer.pkl', 'scoring_explainer.pkl')
scoring_explainer_model = automl_run.register_model(model_name='scoring_explainer', model_path='scoring_explainer.pkl')