In [2]:
from azureml.core.model import Model
import azureml.core
import os
from azureml.core import Workspace

# Workspace coordinates: taken from the environment when set, otherwise the
# defaults below are used.
subscription_id = os.getenv("SUBSCRIPTION_ID", default="a6c2a7cc-d67e-4a1a-b765-983f08c0423a")
resource_group = os.getenv("RESOURCE_GROUP", default="xiaoyzhu-mlworkspace")
workspace_name = os.getenv("WORKSPACE_NAME", default="xiaoyzhu-MLworkspace")
workspace_region = os.getenv("WORKSPACE_REGION", default="eastus2")


try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config(file_name="xiaoyzhuconfig.json")
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except Exception:
    # Keep the hint for the reader, but re-raise: the statements below need `ws`,
    # and continuing would only fail later with a less informative error (or
    # silently reuse a stale `ws` left over from an earlier run).
    print("Workspace not accessible. Change your parameters or create a new workspace below")
    raise

# Register a locally stored pickle as model "Mymodel" in the workspace.
model = Model.register(model_path = r"F:\PatientHub\MLModels\DiabetesReadmission\data\model.pkl",
                       model_name = "Mymodel",
                       tags = {"key": "0.1"},
                       description = "test",
                       workspace = ws)
# Last expression of the cell: show the registered models named "Mymodel".
model.list(workspace = ws, name = "Mymodel")

Falling back to use azure cli credentials. This fall back to use azure cli credentials will be removed in the next release. 
Make sure your code doesn't require 'az login' to have happened before using azureml-sdk, except the case when you are specifying AzureCliAuthentication in azureml-sdk.


Wrote the config file xiaoyzhuconfig.json to: F:\PatientHub\MLModels\Diabetes\aml_config\xiaoyzhuconfig.json
Workspace configuration succeeded. Skip the workspace creation steps below
Registering model Mymodel


[<azureml.core.model.Model at 0x2d3520a16d8>]

In [2]:
import numpy as np

# Read in the data

In [3]:
df_raw = pd.read_csv(datain_dir + 'diabetic_data.csv')
list(df_raw.columns)

['encounter_id',
 'patient_nbr',
 'race',
 'gender',
 'age',
 'weight',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'payer_code',
 'medical_specialty',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'diag_1',
 'diag_2',
 'diag_3',
 'number_diagnoses',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted']

In [4]:
# Columns removed right after loading (names that are absent are ignored).
to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
           'glipizide-metformin', 'glimepiride-pioglitazone',
           'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id']

# Re-read the CSV so the cell can be re-run on its own, prune the columns above
# and turn the '?' marker into NaN.
df_raw = (
    pd.read_csv(datain_dir + 'diabetic_data.csv')
      .drop(to_drop, axis=1, errors = 'ignore')
      .replace('?', np.nan)
)
print(df_raw.shape)

(101766, 40)


# Generate boolean features for only most common medical specialties

In [5]:
# Patient counts for the most common medical specialties, indexed by specialty name.
spec_counts_raw = {"specs": ['InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice',
       'Cardiology', 'Surgery-General'], "num patients": [14635,  7565,  7440,  5352,  3099]}

spec_counts = pd.DataFrame(spec_counts_raw, columns = ['specs', "num patients"]).set_index(["specs"])
spec_thresh = 5
# `.items()` replaces `.iteritems()`, which was removed in pandas 2.0; both yield
# the same (column label, Series) pairs.
# NOTE(review): because this iterates the *columns* of `spec_counts`, the loop body
# runs once with spec == 'num patients' and adds a single all-False column
# 'spec_num patients' (the shape printed below grows by one column only), not one
# flag per specialty. Iterating `spec_counts.head(spec_thresh).index` would give
# the per-specialty flags, but the hard-coded `target_cols` list in score.py
# contains 'spec_num patients', so that change needs the list regenerated and the
# model retrained. Behaviour is intentionally left as it was.
for (spec, count) in spec_counts.head(spec_thresh).items():
    new_col = 'spec_' + str(spec)
    df_raw[new_col] = (df_raw.medical_specialty == spec)
print(df_raw.shape)
    
# df_raw.filter(regex='spec').sample(10)

(101766, 41)


# Generate boolean features for top N diagnoses

In [6]:
# Patient counts for the ten most common ICD-9 codes, indexed by code.
diag_counts_raw = {"icd9value": ['428', '250', '276', '414', '401', '427', '599', '496', '403', '486'], 'num patients w diag': [18101., 17861., 13816., 12895., 12371., 11757.,  6824.,  5990.,
        5693.,  5455.]}

diag_counts = pd.DataFrame(diag_counts_raw, columns = [ 'icd9value', 'num patients w diag']).set_index(['icd9value'])

diag_thresh = 10
# `.items()` replaces `.iteritems()`, which was removed in pandas 2.0; both yield
# the same (column label, Series) pairs.
# NOTE(review): this iterates the *columns* of `diag_counts`, so the loop runs once
# with icd9 == 'num patients w diag' and adds a single all-False column
# 'diag_num patients w diag' (the printed shape grows by one column), not one flag
# per ICD-9 code. Iterating `diag_counts.head(diag_thresh).index` would give the
# per-code flags, but score.py's hard-coded `target_cols` contains
# 'diag_num patients w diag', so that change needs the list regenerated and the
# model retrained. Behaviour is intentionally left as it was.
for (icd9, count) in diag_counts.head(diag_thresh).items():
    new_col = 'diag_' + str(icd9)
    df_raw[new_col] = (df_raw.diag_1 == icd9)|(df_raw.diag_2 == icd9)|(df_raw.diag_3 == icd9)

print(df_raw.shape)

(101766, 42)


# Clean the data

In [7]:
df_raw2 = pd.DataFrame(df_raw, copy=True) #preserve df_raw so I can rerun this step
# Keep only the lower bound of the age bracket, e.g. '[0-10)' -> '0'.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
df_raw2['age'] = df_raw2.age.str.extract(r'(\d+)-\d+')

to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-pioglitazone', 'weight', 'medical_specialty', 'diag_2',
       'diag_1', 'diag_3', 'patient_nbr', 'encounter_id']
df_raw2 = df_raw2.drop(to_drop, axis=1, errors = 'ignore')
print(df_raw2.shape)
#break out categorical variables into binaries
cat_cols = ['gender', 'tolbutamide', 'acarbose', 'miglitol', 'tolazamide',
       'metformin-rosiglitazone', 'change', 'diabetesMed',
       'glyburide-metformin', 'readmitted', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'race', 'admission_type_id',
       'admission_source_id', 'payer_code', 'discharge_disposition_id']
df_raw2 = pd.get_dummies(df_raw2, columns=cat_cols)

#dropping these leaves us with one binary variable, ideal for simplicity
df_raw2 = df_raw2.drop(['readmitted_<30','readmitted_>30'], axis=1)

#cleaning up outcome variable: readmitted (at any time) is the complement of 'NO'
df_raw2['is_readmitted'] = (df_raw2.readmitted_NO == 0)
df_raw2 = df_raw2.drop('readmitted_NO', axis=1)

#ta daaaaaah, the data is ready to go
df = pd.DataFrame(df_raw2)
list(df.columns)

(101766, 38)


['age',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'spec_num patients',
 'diag_num patients w diag',
 'gender_Female',
 'gender_Male',
 'gender_Unknown/Invalid',
 'tolbutamide_No',
 'tolbutamide_Steady',
 'acarbose_Down',
 'acarbose_No',
 'acarbose_Steady',
 'acarbose_Up',
 'miglitol_Down',
 'miglitol_No',
 'miglitol_Steady',
 'miglitol_Up',
 'tolazamide_No',
 'tolazamide_Steady',
 'tolazamide_Up',
 'metformin-rosiglitazone_No',
 'metformin-rosiglitazone_Steady',
 'change_Ch',
 'change_No',
 'diabetesMed_No',
 'diabetesMed_Yes',
 'glyburide-metformin_Down',
 'glyburide-metformin_No',
 'glyburide-metformin_Steady',
 'glyburide-metformin_Up',
 'max_glu_serum_>200',
 'max_glu_serum_>300',
 'max_glu_serum_None',
 'max_glu_serum_Norm',
 'A1Cresult_>7',
 'A1Cresult_>8',
 'A1Cresult_None',
 'A1Cresult_Norm',
 'metformin_Down',
 'metformin_No',
 'metformin_Steady',
 'me

# Define this machine learning problem, impute, set aside test data

In [8]:
# Label column created by the cleaning step.
outcome_column = 'is_readmitted'

# Missing values are replaced by the out-of-range sentinel -9999
# (chosen because the focus is on tree based methods).
dff = df.fillna(-9999)

# Separate label and features, then make a seeded random 50/50 split
# (no `stratify` argument is passed, so class balance is not enforced).
y = dff[outcome_column]
X = dff.drop(outcome_column, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Plain NumPy copy of the training labels.
y_train_array = np.array(y_train.tolist())

# AML Fit data to Random Forest model, trying different subsets of variables


In [9]:
set_diagnostics_collection(send_diagnostics = True)

Turning diagnostics collection on. 


In [10]:
import azureml.core
import os
from azureml.core import Workspace

# Workspace coordinates: taken from the environment when set, otherwise the
# defaults below are used.
subscription_id = os.getenv("SUBSCRIPTION_ID", default="a6c2a7cc-d67e-4a1a-b765-983f08c0423a")
resource_group = os.getenv("RESOURCE_GROUP", default="xiaoyzhu-mlworkspace")
workspace_name = os.getenv("WORKSPACE_NAME", default="xiaoyzhu-MLworkspace")
workspace_region = os.getenv("WORKSPACE_REGION", default="eastus2")


try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    # write the details of the workspace to a configuration file to the notebook library
    ws.write_config(file_name="xiaoyzhuconfig.json")
    print("Workspace configuration succeeded. Skip the workspace creation steps below")
except Exception:
    # Keep the hint for the reader, but re-raise: the experiment below needs a
    # valid `ws`, and continuing would either fail on an undefined name or
    # silently reuse a stale workspace object from an earlier cell.
    print("Workspace not accessible. Change your parameters or create a new workspace below")
    raise
import logging
experiment = Experiment(ws, "Diabetes_prediction")
# AutoML classification run on the training split: 10 iterations, 3-fold
# cross-validation, optimising weighted AUC, one concurrent iteration per CPU.
automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             primary_metric = 'AUC_weighted',
                             iteration_timeout_minutes = 60,
                             iterations = 10,
                             n_cross_validations = 3,
                             max_concurrent_iterations = multiprocessing.cpu_count(),
                             verbosity = logging.INFO,
                             X = X_train, 
                             preprocess = True,
                             y = y_train_array,
                             model_explainability = False,
                             path = './')

Falling back to use azure cli credentials. This fall back to use azure cli credentials will be removed in the next release. 
Make sure your code doesn't require 'az login' to have happened before using azureml-sdk, except the case when you are specifying AzureCliAuthentication in azureml-sdk.


Wrote the config file xiaoyzhuconfig.json to: F:\PatientHub\MLModels\Diabetes\aml_config\xiaoyzhuconfig.json
Workspace configuration succeeded. Skip the workspace creation steps below


In [11]:
local_run = experiment.submit(automl_config, show_output = True)

Running on local machine
Parent Run ID: AutoML_63800cb8-2592-4e6a-8b40-760526fd7585
*******************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
TRAINFRAC: Fraction of the training data to train on.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
*******************************************************************************************************************

 ITERATION   PIPELINE                                       TRAINFRAC  DURATION      METRIC      BEST
         0   SparseNormalizer LightGBM                      1.0000     0:00:13       0.6594    0.6594
         1   SparseNormalizer LightGBM                      1.0000     0:00:37       0.6774    0.6774
         2   StandardScalerWrapper LightGBM                 1.0000

In [None]:
from azureml.widgets import RunDetails
RunDetails(local_run).show()

In [None]:
# Gather the float-valued metrics of every child run, keyed by iteration number.
children = list(local_run.get_children())
metricslist = {}
for child_run in children:
    properties = child_run.get_properties()
    metrics = {k: v for k, v in child_run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

# One column per iteration, ordered by iteration number. `axis` is passed by
# keyword: pandas 2.0 no longer accepts it positionally in `sort_index`.
rundata = pd.DataFrame(metricslist).sort_index(axis=1)
rundata

In [None]:
best_run, fitted_model = local_run.get_output()
print(best_run)
print(fitted_model)

In [None]:
fitted_model.predict_proba(X=X_test)

In [None]:
X_test[1:2]

# Register the model for deployment

In [None]:
description = 'AutoML Model'
tags = None
model = local_run.register_model(description = description, tags = tags)

print(local_run.model_id) # This will be written to the script file later in the notebook.

In [None]:
%%writefile score.py
import pickle
import json
import numpy
import azureml.train.automl
from sklearn.externals import joblib
from azureml.core.model import Model
import pandas as pd
import numpy as np


def init():
    """Load the registered model into the module-level ``model`` global.

    The '<<modelid>>' token is a placeholder; the notebook replaces it with the
    registered model id after this file has been written.
    """
    global model
    # Resolve the registered model to a path, then deserialize it with joblib.
    model = joblib.load(Model.get_model_path(model_name = '<<modelid>>'))

# Field order expected in every record of the request's "data" array.
_RAW_COLUMNS = [
    'race', 'gender', 'age', 'weight',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'time_in_hospital', 'payer_code', 'medical_specialty',
    'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
    'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone', 'change', 'diabetesMed']

# Count-like fields that are not one-hot encoded. Clients may send them as
# strings (JSON built from str() values), so they are coerced to numbers.
_NUMERIC_COLUMNS = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses']

# Raw fields that never reach the model (absent names are ignored when dropping).
_DROPPED_COLUMNS = [
    'acetohexamide', 'troglitazone', 'examide', 'citoglipton',
    'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-pioglitazone', 'weight', 'medical_specialty', 'diag_2',
    'diag_1', 'diag_3', 'patient_nbr', 'encounter_id']

# Fields expanded into one indicator column per observed value.
_CATEGORICAL_COLUMNS = [
    'gender', 'tolbutamide', 'acarbose', 'miglitol', 'tolazamide',
    'metformin-rosiglitazone', 'change', 'diabetesMed',
    'glyburide-metformin', 'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
    'rosiglitazone', 'insulin', 'race', 'admission_type_id',
    'admission_source_id', 'payer_code', 'discharge_disposition_id']

# Column list of the cleaned training frame, in training order. The last entry
# is the label and is filtered out before scoring (see _FEATURE_COLUMNS).
_TRAINING_COLUMNS = [
    'age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses',
    'spec_num patients', 'diag_num patients w diag',
    'gender_Female', 'gender_Male', 'gender_Unknown/Invalid',
    'tolbutamide_No', 'tolbutamide_Steady',
    'acarbose_Down', 'acarbose_No', 'acarbose_Steady', 'acarbose_Up',
    'miglitol_Down', 'miglitol_No', 'miglitol_Steady', 'miglitol_Up',
    'tolazamide_No', 'tolazamide_Steady', 'tolazamide_Up',
    'metformin-rosiglitazone_No', 'metformin-rosiglitazone_Steady',
    'change_Ch', 'change_No',
    'diabetesMed_No', 'diabetesMed_Yes',
    'glyburide-metformin_Down', 'glyburide-metformin_No',
    'glyburide-metformin_Steady', 'glyburide-metformin_Up',
    'max_glu_serum_>200', 'max_glu_serum_>300', 'max_glu_serum_None', 'max_glu_serum_Norm',
    'A1Cresult_>7', 'A1Cresult_>8', 'A1Cresult_None', 'A1Cresult_Norm',
    'metformin_Down', 'metformin_No', 'metformin_Steady', 'metformin_Up',
    'repaglinide_Down', 'repaglinide_No', 'repaglinide_Steady', 'repaglinide_Up',
    'nateglinide_Down', 'nateglinide_No', 'nateglinide_Steady', 'nateglinide_Up',
    'chlorpropamide_Down', 'chlorpropamide_No', 'chlorpropamide_Steady', 'chlorpropamide_Up',
    'glimepiride_Down', 'glimepiride_No', 'glimepiride_Steady', 'glimepiride_Up',
    'glipizide_Down', 'glipizide_No', 'glipizide_Steady', 'glipizide_Up',
    'glyburide_Down', 'glyburide_No', 'glyburide_Steady', 'glyburide_Up',
    'pioglitazone_Down', 'pioglitazone_No', 'pioglitazone_Steady', 'pioglitazone_Up',
    'rosiglitazone_Down', 'rosiglitazone_No', 'rosiglitazone_Steady', 'rosiglitazone_Up',
    'insulin_Down', 'insulin_No', 'insulin_Steady', 'insulin_Up',
    'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other',
    'admission_type_id_1', 'admission_type_id_2', 'admission_type_id_3', 'admission_type_id_4',
    'admission_type_id_5', 'admission_type_id_6', 'admission_type_id_7', 'admission_type_id_8',
    'admission_source_id_1', 'admission_source_id_2', 'admission_source_id_3',
    'admission_source_id_4', 'admission_source_id_5', 'admission_source_id_6',
    'admission_source_id_7', 'admission_source_id_8', 'admission_source_id_9',
    'admission_source_id_10', 'admission_source_id_11', 'admission_source_id_13',
    'admission_source_id_14', 'admission_source_id_17', 'admission_source_id_20',
    'admission_source_id_22', 'admission_source_id_25',
    'payer_code_BC', 'payer_code_CH', 'payer_code_CM', 'payer_code_CP', 'payer_code_DM',
    'payer_code_FR', 'payer_code_HM', 'payer_code_MC', 'payer_code_MD', 'payer_code_MP',
    'payer_code_OG', 'payer_code_OT', 'payer_code_PO', 'payer_code_SI', 'payer_code_SP',
    'payer_code_UN', 'payer_code_WC',
    'discharge_disposition_id_1', 'discharge_disposition_id_2', 'discharge_disposition_id_3',
    'discharge_disposition_id_4', 'discharge_disposition_id_5', 'discharge_disposition_id_6',
    'discharge_disposition_id_7', 'discharge_disposition_id_8', 'discharge_disposition_id_9',
    'discharge_disposition_id_10', 'discharge_disposition_id_11', 'discharge_disposition_id_12',
    'discharge_disposition_id_13', 'discharge_disposition_id_14', 'discharge_disposition_id_15',
    'discharge_disposition_id_16', 'discharge_disposition_id_17', 'discharge_disposition_id_18',
    'discharge_disposition_id_19', 'discharge_disposition_id_20', 'discharge_disposition_id_22',
    'discharge_disposition_id_23', 'discharge_disposition_id_24', 'discharge_disposition_id_25',
    'discharge_disposition_id_27', 'discharge_disposition_id_28',
    'is_readmitted']

# Model inputs: every training column except the label, in training order.
_FEATURE_COLUMNS = [col for col in _TRAINING_COLUMNS if col != 'is_readmitted']


def _prepare_features(data):
    """Turn raw request records into the feature frame used for scoring.

    Args:
        data: list of records, each a list of 47 values ordered as in
            ``_RAW_COLUMNS``.

    Returns:
        pandas.DataFrame with exactly the columns of ``_FEATURE_COLUMNS``, in
        that order, one row per record.
    """
    df_raw = pd.DataFrame(data=data, columns=_RAW_COLUMNS)
    df_raw = df_raw.replace('?', np.nan)
    df_raw[_NUMERIC_COLUMNS] = df_raw[_NUMERIC_COLUMNS].apply(pd.to_numeric, errors='coerce')

    # The two loops below mirror the training notebook. `.items()` replaces
    # `.iteritems()` (removed in pandas 2.0) and yields (column label, Series)
    # pairs, so each loop adds ONE all-False column: 'spec_num patients' and
    # 'diag_num patients w diag'. Both names are part of _TRAINING_COLUMNS.
    # NOTE(review): per-specialty / per-code flags were probably intended;
    # changing that requires retraining and regenerating _TRAINING_COLUMNS.
    spec_counts_raw = {"specs": ['InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice',
        'Cardiology', 'Surgery-General'], "num patients": [14635,  7565,  7440,  5352,  3099]}
    spec_counts = pd.DataFrame(spec_counts_raw, columns = ['specs', "num patients"]).set_index(["specs"])
    spec_thresh = 5
    for (spec, _count) in spec_counts.head(spec_thresh).items():
        df_raw['spec_' + str(spec)] = (df_raw.medical_specialty == spec)

    diag_counts_raw = {"icd9value": ['428', '250', '276', '414', '401', '427', '599', '496', '403', '486'],
        'num patients w diag': [18101., 17861., 13816., 12895., 12371., 11757.,  6824.,  5990.,
        5693.,  5455.]}
    diag_counts = pd.DataFrame(diag_counts_raw, columns = [ 'icd9value', 'num patients w diag']).set_index(['icd9value'])
    diag_thresh = 10
    for (icd9, _count) in diag_counts.head(diag_thresh).items():
        # Element-wise comparison (the previous `.to_string() == icd9` compared
        # the printed form of the whole column with the label).
        df_raw['diag_' + str(icd9)] = (df_raw.diag_1 == icd9)|(df_raw.diag_2 == icd9)|(df_raw.diag_3 == icd9)

    # Keep only the lower bound of the age bracket, e.g. '[0-10)' -> '0'.
    df_raw['age'] = df_raw['age'].str.extract(r'(\d+)-\d+', expand=False)

    df_feat = df_raw.drop(_DROPPED_COLUMNS, axis=1, errors = 'ignore')
    #break out categorical variables into binaries
    df_feat = pd.get_dummies(df_feat, columns=_CATEGORICAL_COLUMNS)

    # Align with training: adds indicator columns missing from this batch
    # (filled with 0), discards columns not seen in training, excludes the
    # label and restores the training column order.
    return df_feat.reindex(columns=_FEATURE_COLUMNS, fill_value=0)


def run(rawdata):
    """Score one request.

    Args:
        rawdata: JSON text of the form ``{"data": [[...47 raw values...], ...]}``.

    Returns:
        JSON text ``{"result": [[...], ...]}`` holding the output of
        ``model.predict_proba`` for each record, or ``{"error": "<message>"}``
        when anything fails while parsing, preparing or scoring.
    """
    try:
        data = json.loads(rawdata)['data']
        # Log the batch size only; the records themselves are not printed.
        print("number of records received:", len(data))
        features = _prepare_features(data)
        print("feature frame shape:", features.shape)
        result = model.predict_proba(features)
    except Exception as e:
        result = str(e)
        return json.dumps({"error": result})
    return json.dumps({"result":result.tolist()})

In [None]:
ml_run = AutoMLRun(experiment = experiment, run_id = local_run.id)

In [None]:
dependencies = ml_run.get_run_sdk_dependencies(iteration = 4)

In [None]:
for p in ['azureml-train-automl', 'azureml-sdk', 'azureml-core']:
    print('{}\t{}'.format(p, dependencies[p]))

In [None]:
from azureml.core.conda_dependencies import CondaDependencies

myenv = CondaDependencies.create(conda_packages=['numpy','scikit-learn'], pip_packages=['azureml-sdk[automl]'])

conda_env_file_name = 'myenv.yml'
myenv.save_to_file('.', conda_env_file_name)

In [None]:
# Substitute the actual version number in the environment file.
# This is not strictly needed in this notebook because the model should have been generated using the current SDK version.
# However, we include this in case this code is used on an experiment from a previous SDK version.

with open(conda_env_file_name, 'r') as cefr:
    content = cefr.read()

with open(conda_env_file_name, 'w') as cefw:
    cefw.write(content.replace(azureml.core.VERSION, dependencies['azureml-sdk']))

# Substitute the actual model id in the script file.

script_file_name = 'score.py'

with open(script_file_name, 'r') as cefr:
    content = cefr.read()

with open(script_file_name, 'w') as cefw:
    cefw.write(content.replace('<<modelid>>', local_run.model_id))

# create container image

In [None]:
from azureml.core.image import Image, ContainerImage

image_config = ContainerImage.image_configuration(runtime= "python",
                                 execution_script = script_file_name,
                                 conda_file = conda_env_file_name,
                                 tags = {'area': "digits", 'type': "automl_classification"},
                                 description = "Image for ACE PatientHub Diabetes Analysis")

image = Image.create(name = "patienthubdiabetesanalysis",
                     # this is the model object 
                     models = [model],
                     image_config = image_config, 
                     workspace = ws)

image.wait_for_creation(show_output = True)

if image.creation_state == 'Failed':
    print("Image build log at: " + image.image_build_log_uri)

In [None]:
from azureml.core.compute import AksCompute, ComputeTarget
from azureml.core.webservice import Webservice, AksWebservice
# Use the default configuration (can also provide parameters to customize)
prov_config = AksCompute.provisioning_configuration()

aks_name = 'ace-patienthub' 
# Create the cluster
aks_target = ComputeTarget.create(workspace = ws, 
                                  name = aks_name, 
                                  provisioning_configuration = prov_config)

In [None]:
%%time
aks_target.wait_for_completion(show_output = True)
print(aks_target.provisioning_state)
print(aks_target.provisioning_errors)

In [None]:

#Set the web service configuration (using default here)
aks_config = AksWebservice.deploy_configuration(collect_model_data=True, enable_app_insights=True)

In [None]:
webservicelist = Webservice.list(workspace = ws)
webservicelist

In [None]:
%%time
aks_service_name ='ace-patienthub-analysis-0222'

aks_service = Webservice.deploy_from_image(workspace = ws, 
                                           name = aks_service_name,
                                           image = image,
                                           deployment_config = aks_config,
                                           deployment_target = aks_target)
aks_service.wait_for_deployment(show_output = True)
print(aks_service.state)

In [None]:
ws

In [None]:
services = Webservice.list(ws)
# print(services[0].scoring_uri)
for i in services:
    print(i.scoring_uri)

# test with data

In [None]:
X_train[1:2].values

In [None]:
%%time
import json

# Build the request body from one row of the training features and encode it as
# UTF-8 bytes.
# NOTE(review): X_train holds the engineered feature columns, whereas run() in
# score.py builds its frame from 47 raw field names — these look incompatible,
# so this call presumably returns the service's {"error": ...} payload. Confirm,
# and consider sending a raw record instead.
test_sample = json.dumps({'data': X_train[1:2].values.tolist()})
test_sample = bytes(test_sample,encoding = 'utf8')

# Only call the service when it reports a healthy state; otherwise fail loudly.
if aks_service.state == "Healthy":
    prediction = aks_service.run(input_data=test_sample)
    print(prediction)
else:
    raise ValueError("Service deployment isn't healthy, can't call the service")

In [None]:
import json
import os

import requests

# URL for the web service
scoring_uri = 'http://51.143.97.86/api/v1/service/ace-patienthub-analysis-0222/score'
# If the service is authenticated, export its key as AKS_SERVICE_KEY before
# running this cell. The key is deliberately not stored in the notebook.
key = os.getenv("AKS_SERVICE_KEY")

# One raw record, in the field order expected by run() in score.py.
item_to_score = ['Caucasian', 'Female', '[0-10)', '?', 6, 25, 1,
        1, '?', 'Pediatrics-Endocrinology', 41, 0, 1, 0, 0, 0, '250.83',
        '?', '?', 1, 'None', 'None', 'No', 'No', 'No', 'No', 'No', 'No',
        'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
        'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No']

item_to_score_string = [str(i) for i in item_to_score]
print(item_to_score_string)
# A single record is sent, so a single result comes back
data = {"data": 
            [
                item_to_score_string
            ]
        }
# Convert to JSON string
input_data = json.dumps(data)

# Set the content type
headers = { 'Content-Type':'application/json' }
# If authentication is enabled (a key was provided), set the authorization header
if key:
    headers['Authorization']=f'Bearer {key}'

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers = headers)
print(resp.text)

# test

In [None]:
data = json.loads(input_data)['data']

In [None]:
data

In [None]:
df_raw = pd.DataFrame(data=data, columns=['race', 'gender', 'age', 'weight',
        'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
        'time_in_hospital', 'payer_code', 'medical_specialty',
        'num_lab_procedures', 'num_procedures', 'num_medications',
        'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
        'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
        'tolazamide', 'examide', 'citoglipton', 'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone', 'change', 'diabetesMed'])
to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
'glipizide-metformin', 'glimepiride-pioglitazone',
'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id']
df_raw.drop(to_drop, axis=1, inplace=True, errors = 'ignore')
df_raw = df_raw.replace('?', np.nan) 
# Local replay of the flag-generation step of score.py, for debugging.
spec_counts_raw = {"specs": ['InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice',
       'Cardiology', 'Surgery-General'], "num patients": [14635,  7565,  7440,  5352,  3099]}

spec_counts = pd.DataFrame(spec_counts_raw, columns = ['specs', "num patients"]).set_index(["specs"])
spec_thresh = 5
# `.items()` replaces `.iteritems()` (removed in pandas 2.0); both yield
# (column label, Series) pairs.
# NOTE(review): the loops therefore run once each and add the all-False columns
# 'spec_num patients' and 'diag_num patients w diag' — the names that appear in
# `target_cols` — rather than one flag per specialty / ICD-9 code.
for (spec, count) in spec_counts.head(spec_thresh).items():
    new_col = 'spec_' + str(spec)
    df_raw[new_col] = (df_raw.medical_specialty == spec)

diag_counts_raw = {"icd9value": ['428', '250', '276', '414', '401', '427', '599', '496', '403', '486'], 'num patients w diag': [18101., 17861., 13816., 12895., 12371., 11757.,  6824.,  5990.,
5693.,  5455.]}

diag_counts = pd.DataFrame(diag_counts_raw, columns = [ 'icd9value', 'num patients w diag']).set_index(['icd9value'])

diag_thresh = 10
for (icd9, count) in diag_counts.head(diag_thresh).items():
    new_col = 'diag_' + str(icd9)
    # Element-wise comparison, as in the training cell. The previous
    # `.to_string() == icd9` compared the printed form of the whole column with
    # the label and always produced a single scalar False.
    df_raw[new_col] = (df_raw.diag_1 == icd9)|(df_raw.diag_2 == icd9)|(df_raw.diag_3 == icd9)


df_raw2 = pd.DataFrame(df_raw, copy=True) #preserve df_raw so I can rerun this step
df_raw2['age'] = df_raw2.age.str.extract('(\d+)-\d+')

to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
       'glipizide-metformin', 'glimepiride-pioglitazone',
       'metformin-pioglitazone', 'weight', 'medical_specialty', 'diag_2',
       'diag_1', 'diag_3', 'patient_nbr', 'encounter_id']
df_raw2.drop(to_drop, axis=1, inplace=True,errors = 'ignore')

#break out categorical variables into binaries
cat_cols = ['gender', 'tolbutamide', 'acarbose', 'miglitol', 'tolazamide',
       'metformin-rosiglitazone', 'change', 'diabetesMed',
       'glyburide-metformin', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'race', 'admission_type_id',
       'admission_source_id', 'payer_code', 'discharge_disposition_id']
target_cols = ['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'spec_num patients', 'diag_num patients w diag', 'gender_Female', 'gender_Male', 'gender_Unknown/Invalid', 'tolbutamide_No', 'tolbutamide_Steady', 'acarbose_Down', 'acarbose_No', 'acarbose_Steady', 'acarbose_Up', 'miglitol_Down', 'miglitol_No', 'miglitol_Steady', 'miglitol_Up', 'tolazamide_No', 'tolazamide_Steady', 'tolazamide_Up', 'metformin-rosiglitazone_No', 'metformin-rosiglitazone_Steady', 'change_Ch', 'change_No', 'diabetesMed_No', 'diabetesMed_Yes', 'glyburide-metformin_Down', 'glyburide-metformin_No', 'glyburide-metformin_Steady', 'glyburide-metformin_Up', 'max_glu_serum_>200', 'max_glu_serum_>300', 'max_glu_serum_None', 'max_glu_serum_Norm', 'A1Cresult_>7', 'A1Cresult_>8', 'A1Cresult_None', 'A1Cresult_Norm', 'metformin_Down', 'metformin_No', 'metformin_Steady', 'metformin_Up', 'repaglinide_Down', 'repaglinide_No', 'repaglinide_Steady', 'repaglinide_Up', 'nateglinide_Down', 'nateglinide_No', 'nateglinide_Steady', 'nateglinide_Up', 'chlorpropamide_Down', 'chlorpropamide_No', 'chlorpropamide_Steady', 'chlorpropamide_Up', 'glimepiride_Down', 'glimepiride_No', 'glimepiride_Steady', 'glimepiride_Up', 'glipizide_Down', 'glipizide_No', 'glipizide_Steady', 'glipizide_Up', 'glyburide_Down', 'glyburide_No', 'glyburide_Steady', 'glyburide_Up', 'pioglitazone_Down', 'pioglitazone_No', 'pioglitazone_Steady', 'pioglitazone_Up', 'rosiglitazone_Down', 'rosiglitazone_No', 'rosiglitazone_Steady', 'rosiglitazone_Up', 'insulin_Down', 'insulin_No', 'insulin_Steady', 'insulin_Up', 'race_AfricanAmerican', 'race_Asian', 'race_Caucasian', 'race_Hispanic', 'race_Other', 'admission_type_id_1', 'admission_type_id_2', 'admission_type_id_3', 'admission_type_id_4', 'admission_type_id_5', 'admission_type_id_6', 'admission_type_id_7', 'admission_type_id_8', 'admission_source_id_1', 'admission_source_id_2', 
'admission_source_id_3', 'admission_source_id_4', 'admission_source_id_5', 'admission_source_id_6', 'admission_source_id_7', 'admission_source_id_8', 'admission_source_id_9', 'admission_source_id_10', 'admission_source_id_11', 'admission_source_id_13', 'admission_source_id_14', 'admission_source_id_17', 'admission_source_id_20', 'admission_source_id_22', 'admission_source_id_25', 'payer_code_BC', 'payer_code_CH', 'payer_code_CM', 'payer_code_CP', 'payer_code_DM', 'payer_code_FR', 'payer_code_HM', 'payer_code_MC', 'payer_code_MD', 'payer_code_MP', 'payer_code_OG', 'payer_code_OT', 'payer_code_PO', 'payer_code_SI', 'payer_code_SP', 'payer_code_UN', 'payer_code_WC', 'discharge_disposition_id_1', 'discharge_disposition_id_2', 'discharge_disposition_id_3', 'discharge_disposition_id_4', 'discharge_disposition_id_5', 'discharge_disposition_id_6', 'discharge_disposition_id_7', 'discharge_disposition_id_8', 'discharge_disposition_id_9', 'discharge_disposition_id_10', 'discharge_disposition_id_11', 'discharge_disposition_id_12', 'discharge_disposition_id_13', 'discharge_disposition_id_14', 'discharge_disposition_id_15', 'discharge_disposition_id_16', 'discharge_disposition_id_17', 'discharge_disposition_id_18', 'discharge_disposition_id_19', 'discharge_disposition_id_20', 'discharge_disposition_id_22', 'discharge_disposition_id_23', 'discharge_disposition_id_24', 'discharge_disposition_id_25', 'discharge_disposition_id_27', 'discharge_disposition_id_28', 'is_readmitted']
df_raw2 = pd.get_dummies(df_raw2, columns=cat_cols)
# Align with the training layout in one step:
#  * indicator columns missing from this batch are added and filled with 0,
#  * columns that were not part of training (including any readmitted_* dummies)
#    are discarded,
#  * the label 'is_readmitted' is excluded — it is an outcome, not a feature,
#  * the training column order is restored (inserting at position 0 while
#    iterating a set left the order arbitrary).
feature_cols = [col for col in target_cols if col != 'is_readmitted']
df_raw2 = df_raw2.reindex(columns=feature_cols, fill_value=0)


# print("df_raw2 is", df_raw2)

In [None]:

# to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
# 'glipizide-metformin', 'glimepiride-pioglitazone',
# 'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id']
# df_raw.drop(to_drop, axis=1, inplace=True, errors = 'ignore')
# df_raw = df_raw.replace('?', np.nan) 

In [None]:
# Class probabilities from the fitted pipeline for the prepared scoring frame df_raw2;
# as the cell's only expression, the result is what the cell displays.
fitted_model.predict_proba(X=df_raw2)

# Model explanation

In [None]:
import sklearn
import shap
from sklearn.model_selection import train_test_split

# print the JS visualization code to the notebook
shap.initjs()

# Split the fitted pipeline into its two named steps: the featurizer and the final estimator.
transformer = fitted_model.named_steps['datatransformer']
estimator = fitted_model.named_steps['prefittedsoftvotingclassifier']

# Slice 1:10 of X_test (up to nine rows, despite the singular name), run through the featurizer.
test_row = X_test[1:10]
transformer_row = transformer.transform(test_row)

print(estimator.predict_proba(transformer_row))

# NOTE(review): the same transformed rows serve both as the KernelExplainer background
# data here and as the rows being explained below - confirm this is intended.
explainer = shap.KernelExplainer(estimator.predict_proba,data=transformer_row, link="logit")

# use Kernel SHAP to explain test set predictions
# NOTE(review): nsamples=9 is a very small sampling budget - confirm it is deliberate.

shap_values = explainer.shap_values(transformer_row, nsamples=9)

# plot the SHAP values for the first model output (index 0) of the first explained row;
# .A converts that row to a dense array (assumes transformer_row is a sparse matrix - TODO confirm)
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], transformer_row[0].A, link="logit")

In [None]:
from azureml.train.automl.automlexplainer import explain_model

# explain_model yields six values; they are unpacked in the order it returns them.
(
    shap_values,
    expected_values,
    overall_summary,
    overall_imp,
    per_class_summary,
    per_class_imp,
) = explain_model(fitted_model, X_train, X_test)

# Overall feature importance, followed by its summary
print(overall_imp, overall_summary, sep="\n")

# Class-level feature importance, followed by its summary
print(per_class_imp, per_class_summary, sep="\n")

In [None]:
from azureml.widgets import RunDetails
# Show the run-details widget for local_run (local_run is not defined in this cell).
RunDetails(local_run).show()

# Deprecated

# Others

In [None]:
# Selector rank for every column of X, keyed by column name.
feat_ranks = pd.Series(rfe.ranking_, index=X.columns)
# Names of the columns whose rank equals 1.
rf_feats = feat_ranks.loc[feat_ranks.eq(1)].index
len(feat_ranks)

In [None]:
# Number of rank-1 feature names collected in rf_feats.
len(rf_feats)

In [None]:
# Reduce both splits with the fitted selector's transform.
X_train_red = rfe.transform(X_train)
X_test_red = rfe.transform(X_test)

# The estimator held by the selector supplies one importance per entry of rf_feats.
rfc = rfe.estimator_
feat_imp = pd.Series(rfc.feature_importances_, index=rf_feats)

# Twenty largest importances, highest first.
feat_imp.sort_values(ascending=False).iloc[:20]

# Assess prediction accuracy

In [None]:
#%% assess accuracy
# Column 1 of predict_proba on the reduced test set is used as the score.
pred = rfc.predict_proba(X_test_red)[:,1]
fpr, tpr, threshold = sklearn.metrics.roc_curve(y_test, pred, drop_intermediate=True)
df_res = pd.DataFrame(data={'fpr':fpr, 'tpr':tpr, 'threshold':threshold})
df_res = df_res[['threshold','fpr','tpr']]

# Decision cut-off: the share of label 1 among all values of y.
t = y.value_counts()[1]/y.value_counts().sum()

# A notebook cell only renders its last expression, so these metrics were previously
# computed and thrown away; print them so the cell actually reports them.
print("auc:", sklearn.metrics.auc(fpr, tpr))
print("f1:", sklearn.metrics.f1_score(y_test, pred>t))
print("accuracy:", sklearn.metrics.accuracy_score(y_test, pred>t))

# roc_auc is defined outside this cell; its result remains the cell's displayed output.
roc_auc(pred, y_test)

In [None]:
#TODO - visualize accuracy metrics based on threshold
pd.options.mode.chained_assignment = None
pred = rfc.predict_proba(X_test_red)[:,1]
fpr, tpr, threshold = sklearn.metrics.roc_curve(y_test, pred, drop_intermediate=True)    
df_res = pd.DataFrame(data={'fpr':fpr, 'tpr':tpr, 'threshold':threshold})
df_res = df_res[['threshold','fpr','tpr']]
sklearn.metrics.auc(fpr, tpr)

In [None]:
# roc_curve computes each row's fpr/tpr by predicting positive when score >= threshold.
# The per-threshold metrics must use the same rule; with a strict `>` the samples whose
# score equals the threshold are flipped to negative, so accuracy/precision/recall/f1
# would describe a different classifier than the fpr/tpr stored in the same row.
df_res['accuracy'] = df_res.threshold.apply(lambda t: sklearn.metrics.accuracy_score(y_test, pred >= t))
df_res['precision'] = df_res.threshold.apply(lambda t: sklearn.metrics.precision_score(y_test, pred >= t))
df_res['recall'] = df_res.threshold.apply(lambda t: sklearn.metrics.recall_score(y_test, pred >= t))
df_res['f1'] = df_res.threshold.apply(lambda t: sklearn.metrics.f1_score(y_test, pred >= t))
# Specificity is the complement of the false-positive rate.
df_res['specificity'] = 1 - df_res.fpr

# First row that reaches the maximum F1 score.
pt_opt = df_res.loc[df_res.f1.idxmax()]
pt_opt

In [None]:
# Widen the default figure size for this and every later plot, then draw all
# columns of df_res against the threshold column.
plt.rcParams.update({"figure.figsize": (20, 6)})
df_res.plot(x="threshold")

# Generate Decision tree based on the top features

In [None]:
# sklearn.externals.six was removed in scikit-learn 0.23, so importing StringIO from it
# fails on current versions; the standard-library class is the same thing on Python 3.
# StringIO is not used in this cell - the name is kept in case another cell relies on it.
from io import StringIO
from sklearn import tree

# Fractional min_samples_* values are read by scikit-learn as fractions of the training
# samples, which keeps the tree small enough to read when drawn.
dtc = DecisionTreeClassifier(min_samples_leaf=0.125, min_samples_split=0.125)
# Trailing semicolon: the original cell showed no output, so suppress the estimator repr.
dtc.fit(X_train_red, y_train);


In [None]:
from IPython.display import Image
from subprocess import check_call
# Write the fitted tree dtc to tree.dot in Graphviz DOT format, labelling nodes with rf_feats.
tree.export_graphviz(dtc, out_file="tree.dot", feature_names=rf_feats, proportion=True)
# Convert tree.dot to tree.png by running the external `dot` program; check_call raises
# CalledProcessError if the command exits with a non-zero status.
check_call(['dot','-Tpng','tree.dot','-o','tree.png'])
# Display the rendered PNG as the cell output.
Image(filename='tree.png')

In [None]:
import numpy as np
import json
import pandas as pd

# One raw encounter, with values in the same order as the column list given to
# pd.DataFrame below (47 values for 47 columns). '?' marks a missing value.
item_to_score = ['Caucasian', 'Female', '[0-10)', '?', 6, 25, 1,
        1, '?', 'InternalMedicine', 41, 0, 1, 0, 0, 0, '250.83',
        '?', '?', 1, 'None', 'None', 'No', 'No', 'No', 'No', 'No', 'No',
        'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
        'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No']

# Every value is sent as a string, so numeric fields arrive as text as well.
item_to_score_string = [str(i) for i in item_to_score]
print(item_to_score_string)
# A single record is scored here, so a single result is expected back.
rawdata = {"data":
            [
                item_to_score_string
            ]
        }
# Round-trip through JSON to mimic the payload a scoring service would receive.
input_data = json.dumps(rawdata)
data = json.loads(input_data)['data']
#print("data is", data, "len is", len(data))
df_raw = pd.DataFrame(data=data, columns=['race', 'gender', 'age', 'weight',
'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
'time_in_hospital', 'payer_code', 'medical_specialty',
'num_lab_procedures', 'num_procedures', 'num_medications',
'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
'tolazamide', 'examide', 'citoglipton', 'insulin',
'glyburide-metformin', 'glipizide-metformin',
'glimepiride-pioglitazone', 'metformin-rosiglitazone',
'metformin-pioglitazone', 'change', 'diabetesMed'])
# Columns to discard; errors='ignore' tolerates names that are not in the frame
# (patient_nbr and encounter_id are not part of this payload).
to_drop = ['acetohexamide', 'troglitazone', 'examide', 'citoglipton',
'glipizide-metformin', 'glimepiride-pioglitazone',
'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id']
df_raw = df_raw.drop(columns=to_drop, errors='ignore')
df_raw = df_raw.replace('?', np.nan)
spec_counts_raw = {"specs": ['InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice',
    'Cardiology', 'Surgery-General'], "num patients": [14635,  7565,  7440,  5352,  3099]}

spec_counts = pd.DataFrame(spec_counts_raw, columns = ['specs', "num patients"]).set_index(["specs"])
spec_thresh = 5
# DataFrame.iteritems() was removed in pandas 2.0; .items() is the equivalent call.
# NOTE(review): iterating a DataFrame this way walks its COLUMNS, not its rows, so the loop
# runs once with spec == 'num patients' and creates a single all-False column named
# 'spec_num patients' instead of one flag per specialty. That column name appears in the
# model's expected feature list, so the behaviour is kept as-is here; changing it to
# per-specialty flags would require retraining with matching features.
for (spec, count) in spec_counts.head(spec_thresh).items():
    new_col = 'spec_' + str(spec)
    print(spec)
    df_raw[new_col] = (df_raw.medical_specialty == spec)
df_raw.columns