Copyright (c) Microsoft Corporation.

Licensed under the MIT License.


# DISCLAIMER
By accessing this code, you acknowledge that the code is not designed, intended, or made available: (1) as a medical device(s); (2) for the diagnosis of disease or other conditions, or in the cure, mitigation, treatment or prevention of a disease or other conditions; or (3) as a substitute for professional medical advice, diagnosis, treatment, or judgment. Do not use this code to replace, substitute, or provide professional medical advice, diagnosis, treatment, or judgement. You are solely responsible for ensuring the regulatory, legal, and/or contractual compliance of any use of the code, including obtaining any authorizations or consents, and any solution you choose to build that incorporates this code in whole or in part.


# Recommendations 
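Load the prepared training and test data, retrieve the trained AutoML model, register the model together with its scoring explainer, and deploy both as a scoring web service on AKS.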

# Library Imports


In [None]:
import numpy as np
import pandas as pd
from pyspark.sql.types import *


# --- Azure Data Lake Storage settings (fill in before running) ---
# Storage account name interpolated into the abfss:// data paths.
data_lake_account_name = ""
# File system segment of the abfss:// data paths.
file_system_name = "raw"

# --- Azure Machine Learning workspace settings (fill in before running) ---
subscription_id = ""
resource_group = ""
workspace_name = ""
workspace_region = ""

# --- Experiment and deployment settings ---
experiment_name = "DiabetesPredictionExperiment"
# Id of the AutoML run whose fitted model is retrieved.
autoMLRunId = ""
# Name of the AKS compute target the service is deployed to.
aks_target_name = ""


In [None]:
# Schema of the prepared (transformed) dataset: every column is nullable.
# Columns are declared in file order -- string-typed columns first, followed
# by the boolean indicator columns and the boolean outcome column.
_string_columns = [
    "race", "gender", "age",
    "admission_type_id", "discharge_disposition_id", "admission_source_id",
    "time_in_hospital", "payer_code",
    "num_lab_procedures", "num_procedures", "num_medications",
    "number_outpatient", "number_emergency", "number_inpatient",
    "number_diagnoses", "max_glu_serum", "A1Cresult",
    "metformin", "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "glipizide", "glyburide", "tolbutamide",
    "pioglitazone", "rosiglitazone", "acarbose", "miglitol",
    "tolazamide", "insulin",
    "glyburide-metformin", "metformin-rosiglitazone",
    "change", "diabetesMed",
    "FirstName", "LastName", "Id",
]
_boolean_columns = [
    "spec_InternalMedicine", "spec_Emergency/Trauma",
    "spec_Family/GeneralPractice", "spec_Cardiology", "spec_Surgery-General",
    "diag_428", "diag_250", "diag_276", "diag_414", "diag_401",
    "diag_427", "diag_599", "diag_496", "diag_403", "diag_486",
    "is_readmitted",
]

transformedSchema = StructType(
    [StructField(column, StringType(), True) for column in _string_columns]
    + [StructField(column, BooleanType(), True) for column in _boolean_columns]
)

# Load Data from Azure Data Lake 


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the prepared training CSV files with the explicit schema and pull the
# result into a pandas DataFrame.
train_data_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/DatasetDiabetes/preparedtraindata/"
df_train = (
    spark.read.format("csv")
    .load(train_data_path, header=True, schema=transformedSchema)
    .toPandas()
)

outcome_column = 'is_readmitted'
id_column = 'Id'

# The row identifier is removed before the features are separated out.
df_train = df_train.drop(columns=[id_column])

#%% Split data for validation
# Features are every remaining column except the outcome; half of the rows
# are held out, with a fixed seed so the split is repeatable.
y = df_train[outcome_column]
X = df_train.drop(columns=[outcome_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Connect to Azure Machine Learning Workspace, Experiment and Load Best Run 



In [None]:
# Connect to the workspace, fetch the fitted model of an existing AutoML run,
# and save that model to a local file.
import azureml.core


from azureml.core import Workspace
ws = Workspace(workspace_name=workspace_name,
               subscription_id=subscription_id,
               resource_group=resource_group)
ws.write_config()

from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl.run import AutoMLRun
from azureml.train.automl import AutoMLConfig

# Re-attach to the AutoML run identified by autoMLRunId instead of retraining.
experiment = Experiment(workspace=ws, name=experiment_name)
previous_automl_run = AutoMLRun(experiment, autoMLRunId, outputs=None)
automl_run = previous_automl_run

best_run, fitted_model = automl_run.get_output()

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Keep the original import where it still exists and fall back to the
# standalone joblib package (a scikit-learn dependency) where it does not.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# save the model to a local file
model_path = 'diabetesmodel'
joblib.dump(fitted_model, model_path)

In [None]:
from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer, save

from interpret.ext.glassbox import LGBMExplainableModel
from azureml.interpret.mimic_wrapper import MimicWrapper

from azureml.train.automl.runtime.automl_explain_utilities import AutoMLExplainerSetupClass, automl_setup_model_explanations

# Build the explanation setup object for the fitted AutoML classifier from the
# train / held-out split.
automl_explainer_setup_obj = automl_setup_model_explanations(
    fitted_model,
    X=X_train,
    X_test=X_test,
    y=y_train,
    task='classification',
)

# MimicWrapper over the AutoML estimator, configured with LGBMExplainableModel
# as the explainable model and attached to the best run.
explainer = MimicWrapper(
    ws,
    automl_explainer_setup_obj.automl_estimator,
    LGBMExplainableModel,
    init_dataset=automl_explainer_setup_obj.X_transform,
    run=best_run,
    features=automl_explainer_setup_obj.engineered_feature_names,
    feature_maps=[automl_explainer_setup_obj.feature_map],
    classes=automl_explainer_setup_obj.classes,
)

# Initialize the ScoringExplainer from the wrapped explainer.
scoring_explainer = TreeScoringExplainer(
    explainer.explainer,
    feature_maps=[automl_explainer_setup_obj.feature_map],
)

# Pickle scoring explainer locally
save(scoring_explainer, exist_ok=True)

# Register the Model 


In [None]:
from azureml.core.model import Model

# Register the locally saved classifier in the workspace.
model_name = "diabetesmodel"
registered_model = Model.register(
    workspace=ws,
    model_path=model_path,  # this points to a local file
    model_name=model_name,  # name the model is registered as
    description="Diabetes Classifier",
    tags={'type': "classification"},
)

# Register the pickled scoring explainer as a second model; its registered
# name is identical to its local file name.
exp_model_path = "scoring_explainer.pkl"
exp_model_name = "scoring_explainer.pkl"
exp_registered_model = Model.register(
    workspace=ws,
    model_path=exp_model_path,  # this points to a local file
    model_name=exp_model_name,  # name the model is registered as
    description="Diabetes Readmission Classifier Explainer",
    tags={'type': "scoring explainer"},
)

In [None]:
def create_additional_features(df):
    """Turn raw encounter rows into the feature frame expected by the model.

    The input frame is not modified; a new frame is returned.

    Processing steps:
      * drop identifier and unused medication columns (absent ones are ignored),
      * treat the '?' placeholder as a missing value,
      * add one boolean ``spec_<name>`` column per tracked medical specialty,
      * add one boolean ``diag_<code>`` column per tracked ICD-9 code, true when
        any of ``diag_1`` / ``diag_2`` / ``diag_3`` equals that code exactly,
      * replace ``age`` by the first number of its ``<low>-<high>`` range text,
      * drop ``medical_specialty`` and the three ``diag_*`` source columns,
      * fill every remaining missing value with -9999.

    Args:
        df: pandas DataFrame containing at least ``medical_specialty``,
            ``diag_1``, ``diag_2``, ``diag_3`` and a string-valued ``age``
            column. An outcome column such as ``readmitted`` is optional and,
            when present, is passed through unchanged.

    Returns:
        A new DataFrame indexed 0..n-1. Original columns keep their order and
        are followed by the ``spec_*`` and then the ``diag_*`` columns.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    unused_columns = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
                      'glipizide-metformin', 'glimepiride-pioglitazone',
                      'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id']
    tracked_specialties = ['InternalMedicine', 'Emergency/Trauma',
                           'Family/GeneralPractice', 'Cardiology', 'Surgery-General']
    tracked_icd9_codes = ['428', '250', '276', '414', '401',
                          '427', '599', '496', '403', '486']

    # Non-inplace drop/replace return new frames, so the caller's DataFrame
    # keeps all of its columns.
    features = df.drop(columns=unused_columns, errors='ignore').replace('?', np.nan)

    # Give missing specialties an explicit label so they never match a tracked one.
    features['medical_specialty'] = features['medical_specialty'].fillna("NaNSpec")
    for specialty in tracked_specialties:
        features['spec_' + specialty] = features['medical_specialty'] == specialty

    for code in tracked_icd9_codes:
        features['diag_' + code] = (
            (features['diag_1'] == code)
            | (features['diag_2'] == code)
            | (features['diag_3'] == code)
        )

    # Keep only the lower bound of the age range text; rows whose text does
    # not match the pattern become missing and are imputed below.
    features['age'] = features['age'].str.extract(r'(\d+)-\d+', expand=False)

    # The source columns are no longer needed once the indicators exist.
    features = features.drop(
        columns=['medical_specialty', 'diag_1', 'diag_2', 'diag_3'],
        errors='ignore',
    )

    # Imputing with outlying value since we are focusing on tree based methods
    return features.reset_index(drop=True).fillna(-9999)

In [None]:
import pandas as pd

# Read the prepared test CSV files (no explicit schema) into pandas.
test_data_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/DatasetDiabetes/preparedtestdata/"
df_test = (
    spark.read.format("csv")
    .load(test_data_path, header=True, multiLine=True)
    .toPandas()
)

# Strip the outcome and identifier columns and keep only the first two rows,
# which serve as sample scoring payloads.
outcome_column = 'readmitted'
id_column = 'Id'
df_test = (
    df_test
    .drop(columns=[outcome_column])
    .head(2)
    .drop(columns=[id_column])
)


In [None]:
# Source text of the web service entry script, kept as a string so the notebook
# can both exec() it and write it to scoring_script.py.
# NOTE: everything between the triple quotes is runtime payload -- edits there
# change the deployed script.
#
# The script defines:
#   * create_additional_features(df): a copy of the notebook function with the
#     'readmitted' value_counts() line commented out.
#   * init(): loads the registered 'diabetesmodel' and 'scoring_explainer.pkl'
#     models into the globals `model` and `scoring_explainer`.
#   * run(input_json): parses the payload with orient='records', scores only
#     the first record, and returns a dict with
#       'predictions'                 -> the value at row 0 / column 0 of the
#                                        transposed predict_proba frame, times
#                                        100 and rounded to 2 decimals;
#       'raw_local_importance_values' -> the 10 [column, value, raw_imp] rows
#                                        with the largest raw importance.
#     Any exception is caught and its message is returned in both fields
#     instead of being raised.
scoring_script = """
import json
import pickle
import numpy as np
import pandas as pd
import azureml.train.automl
from sklearn.externals import joblib
from azureml.core.model import Model
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations

def create_additional_features(df): 
    to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
           'glipizide-metformin', 'glimepiride-pioglitazone',
           'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id']
    df.drop(to_drop, axis=1, inplace=True, errors = 'ignore')
    df_transformed = df.replace('?', np.nan) 
    
    spec_counts_raw = {"specs": ['InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice','Cardiology',
                       'Surgery-General'], "num patients": [14635,  7565,  7440,  5352,  3099]}

    df_transformed['medical_specialty'] = df_transformed['medical_specialty'].replace(np.nan, "NaNSpec")
    spec_counts = pd.DataFrame(data = spec_counts_raw)
    spec_thresh = 5
    for (index, row) in spec_counts.head(spec_thresh).iterrows():
        spec = row['specs']
        new_col = 'spec_' + str(spec)
        df_transformed[new_col] = (df_transformed.medical_specialty == spec)

    diag_counts_raw = {"icd9value": ['428', '250', '276', '414', '401', '427', '599', '496', '403', '486'],
                    'num patients w diag': [18101., 17861., 13816., 12895., 12371., 11757.,  6824.,  5990.,5693., 5455.]}

    diag_counts = pd.DataFrame(diag_counts_raw, columns = [ 'icd9value', 'num patients w diag'])

    diag_thresh = 10
    for (index, row) in diag_counts.head(diag_thresh).iterrows():
        icd9 = row['icd9value']
        new_col = 'diag_' + str(icd9)
        df_transformed[new_col] = (df_transformed.diag_1 == icd9)|(df_transformed.diag_2 == icd9)|(df_transformed.diag_3 == icd9)

    df_transformed = df_transformed.reset_index(drop=True)

    df_transformed2 = pd.DataFrame(df_transformed, copy=True) #preserve df_transformed so I can rerun this step
    df_transformed2['age'] = df_transformed2.age.str.extract('(\d+)-\d+')

    to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
        'glipizide-metformin', 'glimepiride-pioglitazone',
        'metformin-pioglitazone', 'weight', 'medical_specialty', 'diag_2',
        'diag_1', 'diag_3', 'patient_nbr', 'encounter_id']
    df_transformed2.drop(to_drop, axis=1, inplace=True,errors = 'ignore')

    df_transformed2 = df_transformed2.reset_index(drop=True)

    # df_transformed2['readmitted'].value_counts()

    df = pd.DataFrame(df_transformed2)

    #Imputing with outlying value since we are focusing on tree based methods
    df = df.fillna(-9999) 

    df = df.reset_index(drop=True)
    df.dtypes
    
    return df

def init():
    global model
    global scoring_explainer
    # This name is model.id of model that we want to deploy deserialize the model file back
    model_path = Model.get_model_path(model_name = 'diabetesmodel')
    model = joblib.load(model_path)
    
    scoring_explainer_path = Model.get_model_path(model_name = 'scoring_explainer.pkl')
    scoring_explainer = joblib.load(scoring_explainer_path)

def run(input_json):     
    try:
        data_df = pd.read_json(input_json, orient='records').head(1)
        data_df = create_additional_features(data_df)
        stacked_data = pd.DataFrame(data_df.stack().reset_index())
        stacked_data.columns = ['Ind','Column','Value']
        stacked_data = stacked_data[['Column','Value']]
            
        # Get the predictions...
        # prediction = model.predict(data_df)
        prediction = pd.DataFrame(model.predict_proba(data_df),columns=model.y_transformer.inverse_transform(model.classes_)).T.iloc[0,0]
        prediction = np.round(prediction * 100,2)

        automl_explainer_setup_obj = automl_setup_model_explanations(model,X_test=data_df, task='classification')
        raw_local_importance_values = scoring_explainer.explain(automl_explainer_setup_obj.X_test_transform, get_raw=True)

        stacked_data['raw_imp'] = raw_local_importance_values[0]
        stacked_data = stacked_data.sort_values('raw_imp',ascending = False).head(10)
        #stacked_data['raw_imp'] = stacked_data['raw_imp'] * 100
        stacked_data = stacked_data.round(2)
    except Exception as e:
        prediction = np.array([str(e)])
        stacked_data = pd.DataFrame([str(e)])
 
    return {'predictions': prediction.tolist(),
            'raw_local_importance_values': stacked_data.values.tolist()}
"""
# Execute the script text so init() and run() become available in this
# session, then write the very same text to disk as the entry script.
exec(scoring_script)

scoring_script_file_name = 'scoring_script.py'
with open(scoring_script_file_name, "w") as script_file:
    script_file.write(scoring_script)

In [None]:
# Local smoke test of the scoring functions before deployment: serialise the
# first test row as a JSON array of records, load the models, score the row.
first_test_row = df_test.head(1)
json_test_data = first_test_row.to_json(orient='records')
print(json_test_data)

init()
run(json_test_data)

In [None]:
# obtain conda dependencies from the automl run and save the file locally
from azureml.core import Environment
from azureml.core.environment import CondaDependencies
conda_dep = CondaDependencies()
environment_config_file = 'diabetes_conda_env.yml'
best_run.download_file('outputs/conda_env_v_1_0_0.yml', environment_config_file)
with open('diabetes_conda_env.yml', 'r') as f:
    print(f.read())

# create the environment based on the saved conda dependencies file
myenv = Environment.from_conda_specification(name="diabetesenv", file_path=environment_config_file)
conda_dep.add_pip_package("shap==0.35.0")
conda_dep.add_pip_package("azureml-train-automl-runtime==1.32.0")
conda_dep.add_pip_package("inference-schema")
conda_dep.add_pip_package("azureml-interpret==1.32.0")
conda_dep.add_pip_package("azureml-defaults==1.32.0")
conda_dep.add_conda_package("numpy>=1.16.0,<1.19.0")
conda_dep.add_conda_package("pandas==0.25.1")
conda_dep.add_conda_package("scikit-learn==0.22.1")
conda_dep.add_conda_package("py-xgboost<=0.90")
conda_dep.add_conda_package("fbprophet==0.5")
conda_dep.add_conda_package("holidays==0.9.11")
conda_dep.add_conda_package("psutil>=5.2.2,<6.0.0")
myenv.python.conda_dependencies=conda_dep
myenv.register(workspace=ws)

# Deploy Model to AKS Cluster 


In [None]:
from azureml.core.compute import AksCompute, ComputeTarget
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AksWebservice
from azureml.core.webservice import Webservice

# Configure and deploy the web service to the AKS compute target
inference_config = InferenceConfig(
    environment=myenv,
    entry_script=scoring_script_file_name,
)
aks_config = AksWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=2,
    tags={'type': 'automl-classification'},
    description='AutoML Diabetes Readmission Classifier Service',
)

aks_service_name = 'diabetes-readmission-service-aks'
aks_target = AksCompute(ws, aks_target_name)

# Both registered models are deployed together: the scoring explainer and the
# classifier.
aks_service = Model.deploy(
    workspace=ws,
    name=aks_service_name,
    models=[exp_registered_model, registered_model],
    inference_config=inference_config,
    deployment_config=aks_config,
    deployment_target=aks_target,
)


In [None]:
# Wait for the deployment to finish (with output shown), then report the
# resulting service state.
aks_service.wait_for_deployment(show_output=True)
print(aks_service.state)

In [None]:
# Call the deployed service with the first test row serialised as a JSON array
# of records: the orientation the scoring script's run() parses with, and the
# same payload shape used for the local init()/run() check.
json_test_data = df_test.head(1).to_json(orient='records')
aks_service.run(json_test_data)