Copyright (c) Microsoft Corporation.

Licensed under the MIT License.

# DISCLAIMER
By accessing this code, you acknowledge that the code is not designed, intended, or made available: (1) as a medical device(s); (2) for the diagnosis of disease or other conditions, or in the cure, mitigation, treatment or prevention of a disease or other conditions; or (3) as a substitute for professional medical advice, diagnosis, treatment, or judgment. Do not use this code to replace, substitute, or provide professional medical advice, diagnosis, treatment, or judgement. You are solely responsible for ensuring the regulatory, legal, and/or contractual compliance of any use of the code, including obtaining any authorizations or consents, and any solution you choose to build that incorporates this code in whole or in part.


# Library Imports


In [None]:
import numpy as np
import pandas as pd
from pyspark.sql.types import *


# --- Azure Data Lake Storage settings --------------------------------------
# Storage account name (fill in before running) and the file system
# (container) that the training data is read from.
data_lake_account_name = ""
file_system_name = "raw"

# --- Azure Machine Learning workspace settings ------------------------------
# Fill in all four values before running the notebook.
subscription_id = ""
resource_group = ""
workspace_name = ""
workspace_region = ""

In [None]:
# Schema used when reading the prepared training data.
# The columns are split into two groups by type; within and across the groups
# the original field order is kept. Every field is nullable.

# Columns read as strings.
_string_columns = [
    "race",
    "gender",
    "age",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "time_in_hospital",
    "payer_code",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    "max_glu_serum",
    "A1Cresult",
    "metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "tolazamide",
    "insulin",
    "glyburide-metformin",
    "metformin-rosiglitazone",
    "change",
    "diabetesMed",
    "FirstName",
    "LastName",
    "Id",
]

# Columns read as booleans: the spec_* and diag_* flags, then the label.
_boolean_columns = [
    "spec_InternalMedicine",
    "spec_Emergency/Trauma",
    "spec_Family/GeneralPractice",
    "spec_Cardiology",
    "spec_Surgery-General",
    "diag_428",
    "diag_250",
    "diag_276",
    "diag_414",
    "diag_401",
    "diag_427",
    "diag_599",
    "diag_496",
    "diag_403",
    "diag_486",
    "is_readmitted",
]

transformedSchema = StructType(
    [StructField(column, StringType(), True) for column in _string_columns]
    + [StructField(column, BooleanType(), True) for column in _boolean_columns]
)

# Read in Training Data from Azure Data Lake 

In [None]:
from sklearn.model_selection import train_test_split 
import pandas as pd

# Folder in the data lake that holds the prepared training CSV files.
train_data_path = (
    f"abfss://{file_system_name}@{data_lake_account_name}"
    ".dfs.core.windows.net/DatasetDiabetes/preparedtraindata/"
)

# Read the CSV files with the explicit schema, then convert the Spark
# DataFrame to a pandas DataFrame for training.
spark_train_df = spark.read.format("csv").load(
    train_data_path,
    header=True,
    schema=transformedSchema,
)
df_train = spark_train_df.toPandas()

# Name of the label column.
outcome_column = 'is_readmitted'

# The row identifier is removed so it is not used as a feature.
# NOTE(review): FirstName and LastName remain in df_train and are passed on to
# training - confirm this is intended.
id_column = 'Id'
df_train = df_train.drop(columns=[id_column])

In [None]:
# Count the rows for each value of the label column; leaving the result as the
# last expression makes the notebook display it.
label_counts = df_train['is_readmitted'].value_counts()
label_counts

# Define and Train the Model 


In [None]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset

from azureml.train.automl.run import AutoMLRun
from azureml.train.automl import AutoMLConfig
from azureml.automl.runtime.onnx_convert import OnnxConverter
from azureml.core.model import Model
from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice

# Connect to the Azure ML workspace using the identifiers set at the top of
# the notebook, and write the workspace details to a config file.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
ws.write_config()

# AutoML run settings.
# Increase "iterations" and "experiment_timeout_hours" as needed.
automl_settings = dict(
    iterations=20,
    n_cross_validations=5,
    primary_metric='AUC_weighted',
    enable_early_stopping=True,
    max_concurrent_iterations=5,
    model_explainability=True,
    experiment_timeout_hours=0.25,
)

# Classification task on the pandas training frame, with 'is_readmitted' as
# the label column.
automl_config = AutoMLConfig(
    task='classification',
    training_data=df_train,
    label_column_name='is_readmitted',
    **automl_settings,
)

# Experiment under which the AutoML run is recorded in the workspace.
experiment = Experiment(workspace=ws, name="DiabetesPredictionExperiment")

In [None]:
# Submit the AutoML configuration to the experiment; show_output=True requests
# that the run's progress output be shown in the notebook.
local_run = experiment.submit(
    config=automl_config,
    show_output=True,
)