Copyright (c) Microsoft Corporation. 

Licensed under the MIT license.

# DISCLAIMER
By accessing this code, you acknowledge that the code is not designed, intended, or made available: (1) as a medical device(s); (2) for the diagnosis of disease or other conditions, or in the cure, mitigation, treatment or prevention of a disease or other conditions; or (3) as a substitute for professional medical advice, diagnosis, treatment, or judgment. Do not use this code to replace, substitute, or provide professional medical advice, diagnosis, treatment, or judgement. You are solely responsible for ensuring the regulatory, legal, and/or contractual compliance of any use of the code, including obtaining any authorizations or consents, and any solution you choose to build that incorporates this code in whole or in part.


In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [None]:
import numpy as np
import pandas as pd

# --- Azure Data Lake Storage settings -------------------------------------
# Storage account name (left blank; fill in before running) and the file
# system (container) used in the abfss:// output paths further down.
data_lake_account_name = ""
file_system_name = "raw"

# --- Azure Machine Learning workspace settings -----------------------------
# All values are left blank here and must be filled in before running.
subscription_id = ""
resource_group = ""
workspace_name = ""
workspace_region = ""

In [None]:
# Column names of the documents read from the Cosmos DB container, in schema
# order. Every column is declared as a nullable string below.
_cosmos_column_names = [
    "Id",
    "patient_nbr",
    "FirstName",
    "LastName",
    "race",
    "gender",
    "age",
    "weight",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "time_in_hospital",
    "payer_code",
    "medical_specialty",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "diag_1",
    "diag_2",
    "diag_3",
    "number_diagnoses",
    "max_glu_serum",
    "A1Cresult",
    "metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "examide",
    "citoglipton",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone",
    "change",
    "diabetesMed",
    "readmitted",
]

# Explicit read schema: one nullable StringType field per column name.
cosmosSchema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _cosmos_column_names]
)

In [None]:
# Connect to the Azure ML workspace, then load the registered model and its
# scoring explainer from their resolved local paths.
import azureml.core
import joblib  # used instead of the older `from sklearn.externals import joblib`
from azureml.core import Workspace
from azureml.core.model import Model

ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)
# Persist the workspace details to a local config file.
ws.write_config()


def _fetch_registered_artifact(registered_name):
    """Resolve a registered model's path in workspace `ws` and unpickle it.

    Returns a `(path, loaded_object)` tuple.
    """
    artifact_path = Model.get_model_path(model_name=registered_name, _workspace=ws)
    return artifact_path, joblib.load(artifact_path)


model_name = 'diabetesmodel'
model_path, loaded_model = _fetch_registered_artifact(model_name)
print('model loaded!')

model_name = 'scoring_explainer.pkl'
model_path, scoring_explainer = _fetch_registered_artifact(model_name)
print('model explainer loaded!')

In [None]:
def create_additional_features(df):
    """Build the model-ready feature frame from raw patient encounter data.

    The input frame is left untouched; all work happens on a copy.

    Steps:
      1. Drop columns that are not used as features (missing ones are ignored).
      2. Treat the placeholder '?' as a missing value.
      3. Add one boolean ``spec_<name>`` column per tracked medical specialty.
      4. Add one boolean ``diag_<code>`` column per tracked ICD-9 code, true
         when the code appears in any of ``diag_1``, ``diag_2`` or ``diag_3``.
      5. Reduce ``age`` ranges such as ``[70-80)`` to their lower bound, kept
         as a string (``'70'``).
      6. Drop the source columns used for steps 3 and 4, fill the remaining
         missing values with -9999, and reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain ``medical_specialty``, ``diag_1``, ``diag_2``,
        ``diag_3`` and a string-valued ``age`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    unused_columns = [
        'acetohexamide', 'troglitazone', 'examide', 'citoglipton',
        'glipizide-metformin', 'glimepiride-pioglitazone',
        'metformin-pioglitazone', 'weight', 'patient_nbr', 'encounter_id',
    ]
    # Specialties that get an indicator column (presumably the most frequent
    # ones in the training data -- confirm against the training notebook).
    tracked_specialties = [
        'InternalMedicine', 'Emergency/Trauma', 'Family/GeneralPractice',
        'Cardiology', 'Surgery-General',
    ]
    # ICD-9 codes that get an indicator column (same caveat as above).
    tracked_icd9_codes = [
        '428', '250', '276', '414', '401', '427', '599', '496', '403', '486',
    ]

    # drop() without inplace returns a new frame, so the caller's data is
    # never modified.
    features = df.drop(columns=unused_columns, errors='ignore').replace('?', np.nan)

    # Give missing specialties an explicit label so the comparisons below are
    # made against real strings only.
    features['medical_specialty'] = features['medical_specialty'].fillna("NaNSpec")
    for specialty in tracked_specialties:
        features['spec_' + specialty] = features['medical_specialty'] == specialty

    for icd9 in tracked_icd9_codes:
        features['diag_' + icd9] = (
            (features['diag_1'] == icd9)
            | (features['diag_2'] == icd9)
            | (features['diag_3'] == icd9)
        )

    # Keep only the lower bound of the age range; raw string avoids
    # invalid-escape warnings for '\d'.
    features['age'] = features['age'].str.extract(r'(\d+)-\d+', expand=False)

    # The indicator columns replace their source columns.
    features = features.drop(
        columns=['medical_specialty', 'diag_1', 'diag_2', 'diag_3'],
        errors='ignore',
    )

    # Imputing with an outlying value since we are focusing on tree based methods.
    return features.fillna(-9999).reset_index(drop=True)

In [None]:
import pandas as pd
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations


def _publish_results(results_sp, adls_path, cosmos_container):
    """Write a Spark DataFrame to ADLS as CSV and to a Cosmos DB container.

    The CSV is written with a header row in 'overwrite' mode. The Cosmos DB
    write goes through the "patientHubDB" linked service with upserts enabled.
    """
    results_sp.write.option('header', 'true').mode('overwrite').csv(adls_path)
    # Optional: also persist as a Spark table, e.g.
    # results_sp.write.mode("overwrite").saveAsTable("default.<table_name>")
    (
        results_sp.write.format("cosmos.oltp")
        .option("spark.synapse.linkedService", "patientHubDB")
        .option("spark.cosmos.container", cosmos_container)
        .option("spark.cosmos.write.upsertEnabled", "true")
        .mode('overwrite')
        .save()
    )


# ---- Load patient records from Cosmos DB into pandas ----------------------
df_test = (
    spark.read.format("cosmos.oltp")
    .option("spark.synapse.linkedService", "patientHubDB")
    .option("spark.cosmos.container", "Patient")
    .load(schema=cosmosSchema)
    .toPandas()
)

# The outcome column is not a model input, so remove it before featurizing.
outcome_column = 'readmitted'
df_test = df_test.drop(columns=[outcome_column])
df_test = create_additional_features(df_test)

# The model is scored without the document id; df_test keeps it for joining
# the results back to patients.
id_column = 'Id'
data_df = df_test.drop(columns=[id_column])

# ---- Predictions ------------------------------------------------------------
class_probabilities = pd.DataFrame(loaded_model.predict_proba(data_df))
class_probabilities.columns = ['False', 'True']

# Keep only the 'True' class probability and attach the patient id in front.
df_predictions = class_probabilities[['True']].rename(columns={'True': 'Prediction'})
df_predictions.insert(0, 'patientId', df_test['Id'])
df_predictions_sp = spark.createDataFrame(df_predictions)

_publish_results(
    df_predictions_sp,
    f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/DatasetDiabetes/predictions/',
    "Predictions",
)

# ---- Explanations -----------------------------------------------------------
automl_explainer_setup_obj = automl_setup_model_explanations(
    loaded_model, X_test=data_df, task='classification'
)
raw_local_importance_values = scoring_explainer.explain(
    automl_explainer_setup_obj.X_test_transform, get_raw=True
)

# One row per patient and one column per raw feature, then stacked into
# long form: (patientId, Feature, Score).
df_exp = pd.DataFrame(raw_local_importance_values, columns=data_df.columns)
df_exp = df_exp.assign(patientId=df_test['Id']).set_index("patientId")
df_exp_stacked = df_exp.stack().reset_index()
df_exp_stacked.columns = ['patientId', 'Feature', 'Score']
df_exp_stacked_sp = spark.createDataFrame(df_exp_stacked)

_publish_results(
    df_exp_stacked_sp,
    f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/DatasetDiabetes/explanations/',
    "Explanations",
)