In [None]:
pip install alibi

In [None]:
pip install alibi-detect

In [9]:
import pandas as pd
import numpy as np 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

import scipy.stats as stats
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle 

import sklearn
from sklearn import metrics

import alibi 
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector

import sqlalchemy
import snowflake.connector
from sqlalchemy import create_engine
from snowflake.sqlalchemy import *

import xgboost
from datetime import datetime, timedelta
import time  
import pytz    
tz_NY = pytz.timezone('Asia/Kolkata')

import snowflake_creds

import warnings
warnings.filterwarnings('ignore')

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'snowflake_creds'

In [None]:
snowflake_creds = {
    'USER_NAME': 'omarhamzic',
    'PASSWORD': 'Werewolf123!@#'
}

In [None]:
# Creating the connection engine (way 1)
engine = create_engine(URL(
        account="cr21746.ap-south-1",
        user= snowflake_creds.USER_NAME,
        password= snowflake_creds.PASSWORD,
        role="ACCOUNTADMIN",
        warehouse="COMPUTE_WH",
        database="HEALTHDB",
        schema="HEALTHSCHEMA"
    ))

In [None]:
query = """

WITH BASE AS (
    SELECT
        CASE_ID,
        COALESCE(HOSPITAL_CODE, 0) AS HOSPITAL_CODE,
        COALESCE(HOSPITAL_TYPE_CODE, 'None') AS HOSPITAL_TYPE_CODE,
        COALESCE(CITY_CODE_HOSPITAL, 0) AS CITY_CODE_HOSPITAL,
        COALESCE(HOSPITAL_REGION_CODE, 'None') AS HOSPITAL_REGION_CODE,
        COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL, 0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
        COALESCE(DEPARTMENT, 'None') AS DEPARTMENT,
        COALESCE(WARD_TYPE, 'None') AS WARD_TYPE,
        COALESCE(WARD_FACILITY_CODE, 'None') AS WARD_FACILITY_CODE,
        COALESCE(BED_GRADE, 0) AS BED_GRADE,
        PATIENTID,
        COALESCE(CITY_CODE_PATIENT, 0) AS CITY_CODE_PATIENT,
        COALESCE(TYPE_OF_ADMISSION, 'None') AS TYPE_OF_ADMISSION,
        COALESCE(SEVERITY_OF_ILLNESS, 'Minor') AS SEVERITY_OF_ILLNESS,
        COALESCE(VISITORS_WITH_PATIENT, 0) AS VISITORS_WITH_PATIENT,
        COALESCE(AGE, 'None') AS AGE,
        COALESCE(ADMISSION_DEPOSIT, 0) AS ADMISSION_DEPOSIT,
        ADMISSION_DATE,
        DISCHARGE_DATE
    FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
    WHERE ADMISSION_DATE IS NOT NULL AND DISCHARGE_DATE IS NOT NULL -- Ensure dates are valid
),
    
BASE_WITH_FEATURES AS (
    SELECT *,
        -- Extract year, month, and day as separate columns
        YEAR(ADMISSION_DATE) AS ADMISSION_YEAR,
        MONTH(ADMISSION_DATE) AS ADMISSION_MONTH,
        DAY(ADMISSION_DATE) AS ADMISSION_DAY,
        MONTHNAME(ADMISSION_DATE) AS ADMISSION_MONTH_NAME,
        DAYNAME(ADMISSION_DATE) AS ADMISSION_DAY_NAME,
        CONCAT(TYPE_OF_ADMISSION, '-', SEVERITY_OF_ILLNESS) AS ADMISSION_ILLNESS_COMB,
        CONCAT(SEVERITY_OF_ILLNESS, '-', CAST(BED_GRADE AS VARCHAR)) AS ILLNESS_BEDGRADE_COMB,
        CONCAT(DEPARTMENT, '-', SEVERITY_OF_ILLNESS) AS DEPARTMENT_ILLNESS_COMB,
        CASE -- Additional categorization using CASE statement
            WHEN DATEDIFF(day, ADMISSION_DATE, DISCHARGE_DATE) <= 7 THEN 'Short Stay'
            WHEN DATEDIFF(day, ADMISSION_DATE, DISCHARGE_DATE) <= 14 THEN 'Medium Stay'
            ELSE 'Long Stay'
        END AS STAY_DURATION,
        DATEDIFF(day, ADMISSION_DATE, DISCHARGE_DATE) AS LENGHTH_OF_STAY -- Length of Stay
    FROM BASE
    WHERE DISCHARGE_DATE >= ADMISSION_DATE -- Ensure logical discharge dates
)

SELECT * FROM BASE_WITH_FEATURES

"""

In [None]:
#Connect to the train data
with engine.connect() as conn:
    df_train = pd.DataFrame(pd.read_sql(query,conn))
    df_train.columns = [col.upper() for col in df_train.columns.tolist()]

In [10]:
'''
#Can load CSV directly to not waste AWS resources
data = pd.read_csv("Data/health_data.csv")
'''

In [11]:
'''

# Assuming you have a DataFrame called "data" with the appropriate columns
# Convert "ADMISSION_DATE" and "DISCHARGE_DATE" to datetime format
data['ADMISSION_DATE'] = pd.to_datetime(data['ADMISSION_DATE'])
data['DISCHARGE_DATE'] = pd.to_datetime(data['DISCHARGE_DATE'])

# Create a new DataFrame "base" with the COALESCE transformations
base = data.copy()  # Create a copy of the original DataFrame

base['HOSPITAL_CODE'] = base['HOSPITAL_CODE'].fillna(0)
base['HOSPITAL_TYPE_CODE'] = base['HOSPITAL_TYPE_CODE'].fillna('None')
base['CITY_CODE_HOSPITAL'] = base['CITY_CODE_HOSPITAL'].fillna(0)
base['HOSPITAL_REGION_CODE'] = base['HOSPITAL_REGION_CODE'].fillna('None')
base['AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL'] = base['AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL'].fillna(0)
base['DEPARTMENT'] = base['DEPARTMENT'].fillna('None')
base['WARD_TYPE'] = base['WARD_TYPE'].fillna('None')
base['WARD_FACILITY_CODE'] = base['WARD_FACILITY_CODE'].fillna('None')
base['BED_GRADE'] = base['BED_GRADE'].fillna(0)
base['CITY_CODE_PATIENT'] = base['CITY_CODE_PATIENT'].fillna(0)
base['TYPE_OF_ADMISSION'] = base['TYPE_OF_ADMISSION'].fillna('None')
base['SEVERITY_OF_ILLNESS'] = base['SEVERITY_OF_ILLNESS'].fillna('Minor')
base['VISITORS_WITH_PATIENT'] = base['VISITORS_WITH_PATIENT'].fillna(0)
base['AGE'] = base['AGE'].fillna('None')
base['ADMISSION_DEPOSIT'] = base['ADMISSION_DEPOSIT'].fillna(0)

# Create a new DataFrame "base_with_features" with additional transformations
base_with_features = base.copy()

base_with_features['ADMISSION_YEAR'] = base_with_features['ADMISSION_DATE'].dt.year
base_with_features['ADMISSION_MONTH'] = base_with_features['ADMISSION_DATE'].dt.month
base_with_features['ADMISSION_DAY'] = base_with_features['ADMISSION_DATE'].dt.day
base_with_features['ADMISSION_MONTH_NAME'] = base_with_features['ADMISSION_DATE'].dt.month_name()
base_with_features['ADMISSION_DAY_NAME'] = base_with_features['ADMISSION_DATE'].dt.day_name()
base_with_features['ADMISSION_ILLNESS_COMB'] = base_with_features['TYPE_OF_ADMISSION'] + '-' + base_with_features['SEVERITY_OF_ILLNESS']
base_with_features['ILLNESS_BEDGRADE_COMB'] = base_with_features['SEVERITY_OF_ILLNESS'] + '-' + base_with_features['BED_GRADE'].astype(str)
base_with_features['DEPARTMENT_ILLNESS_COMB'] = base_with_features['DEPARTMENT'] + '-' + base_with_features['SEVERITY_OF_ILLNESS']


base_with_features['LENGTH_OF_STAY'] = (base_with_features['DISCHARGE_DATE'] - base_with_features['ADMISSION_DATE']).dt.days

# Additional categorization using CASE statement
def categorize_stay_duration(row):
    duration = row['LENGTH_OF_STAY']
    if duration <= 7:
        return 'Short Stay'
    elif duration <= 14:
        return 'Medium Stay'
    else:
        return 'Long Stay'

base_with_features['STAY_DURATION'] = base_with_features.apply(categorize_stay_duration, axis=1)

# Now, you have the equivalent Python code for the SQL query with datetime conversion and corrected column name
result = base_with_features.copy()

# You can print or work with the "result" DataFrame as needed
print(result)

'''

        CASE_ID  HOSPITAL_CODE HOSPITAL_TYPE_CODE  CITY_CODE_HOSPITAL  \
0             1              8                  c                   3   
1             3             10                  e                   1   
2             4             26                  b                   2   
3             5             26                  b                   2   
4             6             23                  a                   6   
...         ...            ...                ...                 ...   
236699   255877             23                  a                   6   
236700   255879              3                  c                   3   
236701   255880              3                  c                   3   
236702   255882             19                  a                   7   
236703   255883             14                  a                   1   

       HOSPITAL_REGION_CODE  AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL    DEPARTMENT  \
0                         Z                

In [12]:
#df_train=result.copy()

In [13]:
print(df_train.shape)
df_train.head(3)

(236704, 29)


Unnamed: 0,CASE_ID,HOSPITAL_CODE,HOSPITAL_TYPE_CODE,CITY_CODE_HOSPITAL,HOSPITAL_REGION_CODE,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,DEPARTMENT,WARD_TYPE,WARD_FACILITY_CODE,BED_GRADE,PATIENTID,CITY_CODE_PATIENT,TYPE_OF_ADMISSION,SEVERITY_OF_ILLNESS,VISITORS_WITH_PATIENT,AGE,ADMISSION_DEPOSIT,ADMISSION_DATE,DISCHARGE_DATE,ADMISSION_YEAR,ADMISSION_MONTH,ADMISSION_DAY,ADMISSION_MONTH_NAME,ADMISSION_DAY_NAME,ADMISSION_ILLNESS_COMB,ILLNESS_BEDGRADE_COMB,DEPARTMENT_ILLNESS_COMB,LENGTH_OF_STAY,STAY_DURATION
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911,2022-08-10,2022-08-20,2022,8,10,August,Wednesday,Emergency-Extreme,Extreme-2.0,radiotherapy-Extreme,10,Medium Stay
1,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745,2022-09-09,2022-10-19,2022,9,9,September,Friday,Trauma-Extreme,Extreme-2.0,anesthesia-Extreme,40,Long Stay
2,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272,2022-09-22,2022-11-11,2022,9,22,September,Thursday,Trauma-Extreme,Extreme-2.0,radiotherapy-Extreme,50,Long Stay


In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236704 entries, 0 to 236703
Data columns (total 29 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   CASE_ID                            236704 non-null  int64         
 1   HOSPITAL_CODE                      236704 non-null  int64         
 2   HOSPITAL_TYPE_CODE                 236704 non-null  object        
 3   CITY_CODE_HOSPITAL                 236704 non-null  int64         
 4   HOSPITAL_REGION_CODE               236704 non-null  object        
 5   AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL  236704 non-null  int64         
 6   DEPARTMENT                         236704 non-null  object        
 7   WARD_TYPE                          236704 non-null  object        
 8   WARD_FACILITY_CODE                 236704 non-null  object        
 9   BED_GRADE                          236704 non-null  float64       
 10  PATIENTID           

In [15]:
num_columns = ['AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL','VISITORS_WITH_PATIENT','ADMISSION_DEPOSIT']
id_columns = ['CASE_ID','PATIENTID','ADMISSION_DATE','DISCHARGE_DATE']
cat_columns = [col for col in df_train.columns.tolist() if col not in num_columns+id_columns]

In [16]:
num_columns

['AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL',
 'VISITORS_WITH_PATIENT',
 'ADMISSION_DEPOSIT']

In [17]:
cat_columns

['HOSPITAL_CODE',
 'HOSPITAL_TYPE_CODE',
 'CITY_CODE_HOSPITAL',
 'HOSPITAL_REGION_CODE',
 'DEPARTMENT',
 'WARD_TYPE',
 'WARD_FACILITY_CODE',
 'BED_GRADE',
 'CITY_CODE_PATIENT',
 'TYPE_OF_ADMISSION',
 'SEVERITY_OF_ILLNESS',
 'AGE',
 'ADMISSION_YEAR',
 'ADMISSION_MONTH',
 'ADMISSION_DAY',
 'ADMISSION_MONTH_NAME',
 'ADMISSION_DAY_NAME',
 'ADMISSION_ILLNESS_COMB',
 'ILLNESS_BEDGRADE_COMB',
 'DEPARTMENT_ILLNESS_COMB',
 'LENGTH_OF_STAY',
 'STAY_DURATION']

In [18]:
X_train = df_train[num_columns + cat_columns]
print(X_train.shape)
X_train.head()

(236704, 25)


Unnamed: 0,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,VISITORS_WITH_PATIENT,ADMISSION_DEPOSIT,HOSPITAL_CODE,HOSPITAL_TYPE_CODE,CITY_CODE_HOSPITAL,HOSPITAL_REGION_CODE,DEPARTMENT,WARD_TYPE,WARD_FACILITY_CODE,BED_GRADE,CITY_CODE_PATIENT,TYPE_OF_ADMISSION,SEVERITY_OF_ILLNESS,AGE,ADMISSION_YEAR,ADMISSION_MONTH,ADMISSION_DAY,ADMISSION_MONTH_NAME,ADMISSION_DAY_NAME,ADMISSION_ILLNESS_COMB,ILLNESS_BEDGRADE_COMB,DEPARTMENT_ILLNESS_COMB,LENGTH_OF_STAY,STAY_DURATION
0,3,2,4911,8,c,3,Z,radiotherapy,R,F,2.0,7.0,Emergency,Extreme,51-60,2022,8,10,August,Wednesday,Emergency-Extreme,Extreme-2.0,radiotherapy-Extreme,10,Medium Stay
1,2,2,4745,10,e,1,X,anesthesia,S,E,2.0,7.0,Trauma,Extreme,51-60,2022,9,9,September,Friday,Trauma-Extreme,Extreme-2.0,anesthesia-Extreme,40,Long Stay
2,2,2,7272,26,b,2,Y,radiotherapy,R,D,2.0,7.0,Trauma,Extreme,51-60,2022,9,22,September,Thursday,Trauma-Extreme,Extreme-2.0,radiotherapy-Extreme,50,Long Stay
3,2,2,5558,26,b,2,Y,radiotherapy,S,D,2.0,7.0,Trauma,Extreme,51-60,2022,11,2,November,Wednesday,Trauma-Extreme,Extreme-2.0,radiotherapy-Extreme,50,Long Stay
4,2,2,4449,23,a,6,X,anesthesia,S,F,2.0,7.0,Trauma,Extreme,51-60,2022,9,16,September,Friday,Trauma-Extreme,Extreme-2.0,anesthesia-Extreme,20,Long Stay


In [19]:
cat_indices = np.arange(3,25)
cat_indices

array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24])

In [20]:
# category dict for the driftdetector to identify unique categories
categories_per_feature = {f: None for f in cat_indices}
categories_per_feature

{3: None,
 4: None,
 5: None,
 6: None,
 7: None,
 8: None,
 9: None,
 10: None,
 11: None,
 12: None,
 13: None,
 14: None,
 15: None,
 16: None,
 17: None,
 18: None,
 19: None,
 20: None,
 21: None,
 22: None,
 23: None,
 24: None}

In [21]:
# Initialize the detector
cd = TabularDrift(X_train.values, p_val=.05, categories_per_feature=categories_per_feature)

In [22]:
cd.get_config()

{'name': 'TabularDrift',
 'meta': {'version': '0.11.4'},
 'x_ref': array([[3, 2, 4911, ..., 'radiotherapy-Extreme', 10, 'Medium Stay'],
        [2, 2, 4745, ..., 'anesthesia-Extreme', 40, 'Long Stay'],
        [2, 2, 7272, ..., 'radiotherapy-Extreme', 50, 'Long Stay'],
        ...,
        [3, 15, 4241, ..., 'gynecology-Moderate', 50, 'Long Stay'],
        [4, 4, 3036, ..., 'gynecology-Extreme', 20, 'Long Stay'],
        [4, 4, 4326, ..., 'gynecology-Extreme', 30, 'Long Stay']],
       dtype=object),
 'p_val': 0.05,
 'categories_per_feature': {3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None,
  9: None,
  10: None,
  11: None,
  12: None,
  13: None,
  14: None,
  15: None,
  16: None,
  17: None,
  18: None,
  19: None,
  20: None,
  21: None,
  22: None,
  23: None,
  24: None},
 'x_ref_preprocessed': False,
 'preprocess_at_init': True,
 'update_x_ref': None,
 'preprocess_fn': None,
 'correction': 'bonferroni',
 'alternative': 'two-sided',
 'n_features': None,
 'input_s

In [27]:
# # Using pickle to save and load it the trained detector
with open('Trained_Drift_Detector.pkl','wb') as F:
    pickle.dump(cd,F)

with open('Trained_Drift_Detector.pkl','rb') as F:
    trained_drift_model = pickle.load(F)    

In [28]:
trained_drift_model.get_config()

{'name': 'TabularDrift',
 'meta': {'version': '0.11.4'},
 'x_ref': array([[3, 2, 4911, ..., 'radiotherapy-Extreme', 10, 'Medium Stay'],
        [2, 2, 4745, ..., 'anesthesia-Extreme', 40, 'Long Stay'],
        [2, 2, 7272, ..., 'radiotherapy-Extreme', 50, 'Long Stay'],
        ...,
        [3, 15, 4241, ..., 'gynecology-Moderate', 50, 'Long Stay'],
        [4, 4, 3036, ..., 'gynecology-Extreme', 20, 'Long Stay'],
        [4, 4, 4326, ..., 'gynecology-Extreme', 30, 'Long Stay']],
       dtype=object),
 'p_val': 0.05,
 'categories_per_feature': {3: None,
  4: None,
  5: None,
  6: None,
  7: None,
  8: None,
  9: None,
  10: None,
  11: None,
  12: None,
  13: None,
  14: None,
  15: None,
  16: None,
  17: None,
  18: None,
  19: None,
  20: None,
  21: None,
  22: None,
  23: None,
  24: None},
 'x_ref_preprocessed': False,
 'preprocess_at_init': True,
 'update_x_ref': None,
 'preprocess_fn': None,
 'correction': 'bonferroni',
 'alternative': 'two-sided',
 'n_features': None,
 'input_s

In [29]:
preds = trained_drift_model.predict(X_train.values)
labels = ['No!', 'Yes!']
print('Drift? {}'.format(labels[preds['data']['is_drift']]))

Drift? No!


In [30]:
# If you are interested in individual feature-wise drift, this is also possible:
fpreds = trained_drift_model.predict(X_train.values, drift_type='feature')
fpreds

{'data': {'is_drift': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]),
  'distance': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
  'p_val': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32),
  'threshold': 0.05},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.11.4',
  'detector_type': 'drift'}}

In [38]:
# Loop through each feature index in the trained drift model
for feature_index in range(trained_drift_model.n_features):
    # Determine the statistical test to use based on whether the feature is categorical or not
    statistical_test = 'Chi2' if feature_index in list(categories_per_feature.keys()) else 'K-S'
    # Get the name of the feature from the training data
    feature_name = X_train.columns.tolist()[feature_index]
    # Check if the feature is identified as drift or not
    is_drift = fpreds['data']['is_drift'][feature_index]
    # Get the statistical value and p-value for the feature
    statistical_value, p_value = fpreds['data']['distance'][feature_index], fpreds['data']['p_val'][feature_index]
    # Print the results with informative formatting
    print(f'{feature_name} -- Drift? {labels[is_drift]} -- {statistical_test} {statistical_value:.3f} -- p-value {p_value:.3f}')


AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL -- Drift? No! -- K-S 0.000 -- p-value 1.000
VISITORS_WITH_PATIENT -- Drift? No! -- K-S 0.000 -- p-value 1.000
ADMISSION_DEPOSIT -- Drift? No! -- K-S 0.000 -- p-value 1.000
HOSPITAL_CODE -- Drift? No! -- Chi2 6.000 -- p-value 1.000
HOSPITAL_TYPE_CODE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
CITY_CODE_HOSPITAL -- Drift? No! -- Chi2 0.000 -- p-value 1.000
HOSPITAL_REGION_CODE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
DEPARTMENT -- Drift? No! -- Chi2 0.000 -- p-value 1.000
WARD_TYPE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
WARD_FACILITY_CODE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
BED_GRADE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
CITY_CODE_PATIENT -- Drift? No! -- Chi2 0.000 -- p-value 1.000
TYPE_OF_ADMISSION -- Drift? No! -- Chi2 0.000 -- p-value 1.000
SEVERITY_OF_ILLNESS -- Drift? No! -- Chi2 0.000 -- p-value 1.000
AGE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
ADMISSION_YEAR -- Drift? No! -- Chi2 0.000 -- p-value 1.000
ADMISSION_M

In [39]:
# Create a DataFrame using a dictionary
temp = pd.DataFrame({
    'Time Period': [f'2023-01-01 to 2023-01-07'] * len(X_train.columns),
    'Features': X_train.columns.tolist(),
    'Is Drift': fpreds['data']['is_drift'],
    'Stat Test': ['Chi2' if feature in cat_columns else 'K-S' for feature in X_train.columns.tolist()],
    'Stats Value': fpreds['data']['distance'],
    'P-value': fpreds['data']['p_val']
})
# Print the shape of the DataFrame
print(temp.shape)
# Display the DataFrame
temp

(25, 6)


Unnamed: 0,Time Period,Features,Is Drift,Stat Test,Stats Value,P-value
0,2023-01-01 to 2023-01-07,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,0,K-S,0.0,1.0
1,2023-01-01 to 2023-01-07,VISITORS_WITH_PATIENT,0,K-S,0.0,1.0
2,2023-01-01 to 2023-01-07,ADMISSION_DEPOSIT,0,K-S,0.0,1.0
3,2023-01-01 to 2023-01-07,HOSPITAL_CODE,0,Chi2,6.000422,1.0
4,2023-01-01 to 2023-01-07,HOSPITAL_TYPE_CODE,0,Chi2,0.0,1.0
5,2023-01-01 to 2023-01-07,CITY_CODE_HOSPITAL,0,Chi2,0.0,1.0
6,2023-01-01 to 2023-01-07,HOSPITAL_REGION_CODE,0,Chi2,0.0,1.0
7,2023-01-01 to 2023-01-07,DEPARTMENT,0,Chi2,0.0,1.0
8,2023-01-01 to 2023-01-07,WARD_TYPE,0,Chi2,0.0,1.0
9,2023-01-01 to 2023-01-07,WARD_FACILITY_CODE,0,Chi2,0.0,1.0


Check if data drift is working using noise data


In [33]:
temp = X_train.copy()
temp.loc[:5,'HOSPITAL_CODE'] = 100
temp.head()

Unnamed: 0,AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,VISITORS_WITH_PATIENT,ADMISSION_DEPOSIT,HOSPITAL_CODE,HOSPITAL_TYPE_CODE,CITY_CODE_HOSPITAL,HOSPITAL_REGION_CODE,DEPARTMENT,WARD_TYPE,WARD_FACILITY_CODE,BED_GRADE,CITY_CODE_PATIENT,TYPE_OF_ADMISSION,SEVERITY_OF_ILLNESS,AGE,ADMISSION_YEAR,ADMISSION_MONTH,ADMISSION_DAY,ADMISSION_MONTH_NAME,ADMISSION_DAY_NAME,ADMISSION_ILLNESS_COMB,ILLNESS_BEDGRADE_COMB,DEPARTMENT_ILLNESS_COMB,LENGTH_OF_STAY,STAY_DURATION
0,3,2,4911,100,c,3,Z,radiotherapy,R,F,2.0,7.0,Emergency,Extreme,51-60,2022,8,10,August,Wednesday,Emergency-Extreme,Extreme-2.0,radiotherapy-Extreme,10,Medium Stay
1,2,2,4745,100,e,1,X,anesthesia,S,E,2.0,7.0,Trauma,Extreme,51-60,2022,9,9,September,Friday,Trauma-Extreme,Extreme-2.0,anesthesia-Extreme,40,Long Stay
2,2,2,7272,100,b,2,Y,radiotherapy,R,D,2.0,7.0,Trauma,Extreme,51-60,2022,9,22,September,Thursday,Trauma-Extreme,Extreme-2.0,radiotherapy-Extreme,50,Long Stay
3,2,2,5558,100,b,2,Y,radiotherapy,S,D,2.0,7.0,Trauma,Extreme,51-60,2022,11,2,November,Wednesday,Trauma-Extreme,Extreme-2.0,radiotherapy-Extreme,50,Long Stay
4,2,2,4449,100,a,6,X,anesthesia,S,F,2.0,7.0,Trauma,Extreme,51-60,2022,9,16,September,Friday,Trauma-Extreme,Extreme-2.0,anesthesia-Extreme,20,Long Stay


In [34]:
# If you are interested in individual feature-wise drift, this is also possible:
fpreds = trained_drift_model.predict(temp.values, drift_type='feature')
fpreds

{'data': {'is_drift': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0]),
  'distance': array([0.       , 0.       , 0.       , 6.0004225, 0.       , 0.       ,
         0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
         0.       ], dtype=float32),
  'p_val': array([1.       , 1.       , 1.       , 0.9999999, 1.       , 1.       ,
         1.       , 1.       , 1.       , 1.       , 1.       , 1.       ,
         1.       , 1.       , 1.       , 1.       , 1.       , 1.       ,
         1.       , 1.       , 1.       , 1.       , 1.       , 1.       ,
         1.       ], dtype=float32),
  'threshold': 0.05},
 'meta': {'name': 'TabularDrift',
  'online': False,
  'data_type': None,
  'version': '0.11.4',
  'detector_type': 'drift'}}

In [40]:
for index, row in temp.iterrows():
    feature_name = row['Features']
    is_drift = row['Is Drift']
    stat = 'Chi2' if feature_name in cat_columns else 'K-S'
    stat_val = row['Stats Value']
    p_val = row['P-value']
    
    print(f'{feature_name} -- Drift? {labels[is_drift]} -- {stat} {stat_val:.3f} -- p-value {p_val:.3f}')


    

AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL -- Drift? No! -- K-S 0.000 -- p-value 1.000
VISITORS_WITH_PATIENT -- Drift? No! -- K-S 0.000 -- p-value 1.000
ADMISSION_DEPOSIT -- Drift? No! -- K-S 0.000 -- p-value 1.000
HOSPITAL_CODE -- Drift? No! -- Chi2 6.000 -- p-value 1.000
HOSPITAL_TYPE_CODE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
CITY_CODE_HOSPITAL -- Drift? No! -- Chi2 0.000 -- p-value 1.000
HOSPITAL_REGION_CODE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
DEPARTMENT -- Drift? No! -- Chi2 0.000 -- p-value 1.000
WARD_TYPE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
WARD_FACILITY_CODE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
BED_GRADE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
CITY_CODE_PATIENT -- Drift? No! -- Chi2 0.000 -- p-value 1.000
TYPE_OF_ADMISSION -- Drift? No! -- Chi2 0.000 -- p-value 1.000
SEVERITY_OF_ILLNESS -- Drift? No! -- Chi2 0.000 -- p-value 1.000
AGE -- Drift? No! -- Chi2 0.000 -- p-value 1.000
ADMISSION_YEAR -- Drift? No! -- Chi2 0.000 -- p-value 1.000
ADMISSION_M

#### Data Drift Scoring:

In [36]:
def data_monitoring_batch_query(a):
    query = f"""

        SELECT CASE_ID,
               COALESCE(HOSPITAL_CODE,0) AS HOSPITAL_CODE,
               COALESCE(HOSPITAL_TYPE_CODE,'None') AS HOSPITAL_TYPE_CODE,
               COALESCE(CITY_CODE_HOSPITAL,0) AS CITY_CODE_HOSPITAL,
               COALESCE(HOSPITAL_REGION_CODE,'None') AS HOSPITAL_REGION_CODE,
               COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL_X,0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
               COALESCE(DEPARTMENT,'None') AS DEPARTMENT,
               COALESCE(WARD_TYPE,'None') AS WARD_TYPE,
               COALESCE(WARD_FACILITY_CODE,'None') AS WARD_FACILITY_CODE,
               COALESCE(BED_GRADE,0) AS BED_GRADE,
               PATIENTID,
               COALESCE(CITY_CODE_PATIENT,0) AS CITY_CODE_PATIENT,
               COALESCE(TYPE_OF_ADMISSION,'None') AS TYPE_OF_ADMISSION,
               COALESCE(SEVERITY_OF_ILLNESS,'Minor') AS SEVERITY_OF_ILLNESS,
               COALESCE(VISITORS_WITH_PATIENT_X,0) AS VISITORS_WITH_PATIENT,
               COALESCE(AGE,'None') AS AGE,
               COALESCE(ADMISSION_DEPOSIT_X,0) AS ADMISSION_DEPOSIT,
               ADMISSION_DATE,
               DISCHARGE_DATE

            FROM HEALTHDB.HEALTHSCHEMA.TEMP_LOS_PREDICTION_MODEL_LOGGING_TABLE_HARI
            WHERE ADMISSION_DATE >= CURRENT_DATE-144+{a*7} AND ADMISSION_DATE < CURRENT_DATE-144+{(a+1)*7}        

        """
    return query

In [41]:
def data_monitoring(batch_id):
    # Load the train data using SQL query
    with engine.connect() as conn:
        batch_df = pd.DataFrame(pd.read_sql(data_monitoring_batch_query(batch_id), conn))
        batch_df.columns = [col.upper() for col in batch_df.columns.tolist()]
    # Define numerical, categorical, and ID columns
    num_columns = ['AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL', 'VISITORS_WITH_PATIENT', 'ADMISSION_DEPOSIT']
    id_columns = ['CASE_ID', 'PATIENTID', 'ADMISSION_DATE', 'DISCHARGE_DATE']
    cat_columns = [col for col in batch_df.columns.tolist() if col not in num_columns + id_columns]
    # Select the relevant columns for data drift monitoring
    batch_final = batch_df[num_columns + cat_columns]
    # Load the trained data drift detector from a saved file
    with open('Trained_Drift_Detector.pkl', 'rb') as F:
        trained_drift_model = pickle.load(F)    
    # Check for data drift
    fpreds = trained_drift_model.predict(batch_final.values, drift_type='feature')
    # Create a log DataFrame to store monitoring results
    log_df = pd.DataFrame()
    log_df['Time Period'] = ([str(batch_df['ADMISSION_DATE'].min()) + ' to ' + 
                              str(batch_df['ADMISSION_DATE'].max())]
                              * len(batch_final.columns.tolist())
                            )
    log_df['Total Records'] = batch_df.shape[0]
    log_df['Features'] = batch_final.columns.tolist()
    log_df['Is Drift'] = fpreds['data']['is_drift']
    log_df['Stat Test'] = log_df['Features'].apply(lambda x: 'Chi2' if x in cat_columns else 'K-S')
    log_df['Stats Value'] = np.round(fpreds['data']['distance'])
    log_df['P-value'] = np.round(fpreds['data']['p_val'])
    
    return log_df

### Model Drift detector:

In [None]:
### Model Drift detector:

In [None]:
def check_model_drift(ref_metric_dict, cur_metric_dict, model_type='classification', tolerance=0.1):
    if model_type == 'classification':
        # Calculate percentage changes for precision, recall, and ROC-AUC
        precision_change = abs((cur_metric_dict['Precision'] - ref_metric_dict['Precision']) / ref_metric_dict['Precision'])
        recall_change = abs((cur_metric_dict['Recall'] - ref_metric_dict['Recall']) / ref_metric_dict['Recall'])
        roc_auc_change = abs((cur_metric_dict['Roc-Auc'] - ref_metric_dict['Roc-Auc']) / ref_metric_dict['Roc-Auc'])

        # Initialize a counter to track the number of metrics with changes exceeding the tolerance
        counter = 0

        # Iterate through the percentage changes and check if any exceeds the tolerance
        for change in [precision_change, recall_change, roc_auc_change]:
            if change > tolerance:
                counter += 1

        if counter > 0:
            print("ALERT! There is a model drift.")
            print(f"Change in Precision: {precision_change:.2%}")
            print(f"Change in Recall: {recall_change:.2%}")
            print(f"Change in ROC-AUC: {roc_auc_change:.2%}")
            return 1
        else:
            print("There is no model drift.")
            return 0

    elif model_type == 'regression':
        # Calculate percentage changes for RMSE and MAE
        rmse_change = abs((cur_metric_dict['RMSE'] - ref_metric_dict['RMSE']) / ref_metric_dict['RMSE'])
        mae_change = abs((cur_metric_dict['MAE'] - ref_metric_dict['MAE']) / ref_metric_dict['MAE'])

        # Initialize a counter to track the number of metrics with changes exceeding the tolerance
        counter = 0

        # Iterate through the percentage changes and check if any exceeds the tolerance
        for change in [rmse_change, mae_change]:
            if change > tolerance:
                counter += 1

        if counter > 0:
            print("ALERT! There is a model drift.")
            print(f"Change in RMSE: {rmse_change:.2%}")
            print(f"Change in MAE: {mae_change:.2%}")
            return 1, rmse_change, mae_change
        else:
            print("There is no model drift.")
            return 0, 'NONE', 'NONE'


In [None]:
def model_monitoring_batch_query(a):
    query_sim = f"""

        SELECT *
        FROM TEMP_LOS_PREDICTION_MODEL_LOGGING_TABLE_HARI
        WHERE ADMISSION_DATE >= CURRENT_DATE-144+{a*7} AND ADMISSION_DATE < CURRENT_DATE-144+{(a+1)*7}
        
    """
    return query_sim

In [None]:
print(model_monitoring_batch_query(2))

In [None]:
# Loading the train data
with engine.connect() as conn:
    batch_df = pd.DataFrame(pd.read_sql(model_monitoring_batch_query(0),conn))
    batch_df.columns = [col.upper() for col in batch_df.columns.tolist()]

In [None]:
print(batch_df.shape)
batch_df.head()

In [None]:
# Creating the current performance dict (from scoring)

actual = batch_df['LOS_X']
predicted = batch_df['PREDICTED_LOS']

rmse = np.sqrt(metrics.mean_squared_error(actual,predicted))
mae = np.sqrt(metrics.mean_absolute_error(actual,predicted))
print("RMSE: ", rmse)
print("MAE: ", mae)

scoring_ref_metrics = {}
scoring_ref_metrics['RMSE'] = rmse
scoring_ref_metrics['MAE'] = mae #+ 0.2*mae
print(scoring_ref_metrics)

In [None]:
# Loading the reference performance dict (from training)

with open('MODEL_XGB_PERFM_METRICS.pkl', 'rb') as F:
    model_ref_metric = pickle.load(F)

model_ref_metric

In [None]:
check_model_drift(model_ref_metric,scoring_ref_metrics,type='regression',tol=0.1)

In [43]:
def model_monitoring(batch_id):
    # Load the training data using SQL query
    with engine.connect() as conn:
        batch_df = pd.DataFrame(pd.read_sql(model_monitoring_batch_query(batch_id), conn))
        batch_df.columns = [col.upper() for col in batch_df.columns.tolist()]
    
    # Calculate performance metrics for the current batch
    actual = batch_df['LOS_X']
    predicted = batch_df['PREDICTED_LOS']

    rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    mae = np.sqrt(metrics.mean_absolute_error(actual, predicted))

    # Create a dictionary for the current performance metrics
    scoring_ref_metrics = {
        'RMSE': rmse,
        'MAE': mae
    }
    
    # Load the reference performance metrics from training
    with open('MODEL_XGB_PERFM_METRICS.pkl', 'rb') as F:
        model_ref_metrics = pickle.load(F)
    
    # Check for model drift and get RMSE and MAE changes
    model_drift, RMSE_CHANGE, MAE_CHANGE = check_model_drift(model_ref_metrics, scoring_ref_metrics, type='regression', tolerance=0.1)
    
    # Create a log dictionary to store monitoring results
    log = {
        'Time Period': f"{batch_df['ADMISSION_DATE'].min()} to {batch_df['ADMISSION_DATE'].max()}",
        'Total Records': batch_df.shape[0],
        'Scoring Metrics': scoring_ref_metrics,
        'Training Metrics': model_ref_metrics,
        'Model Drift IND': model_drift,
        'RMSE Change': RMSE_CHANGE,
        'MAE Change': MAE_CHANGE
    }
    
    return log
    

In [None]:
# Then create a dictionary with has two keys --> {data_drift:'Yes', model_drift:'Yes'}
# Based on the logic, it should call the retraining function 
# Once above steps are completed, create the retraining script that takes in all the data from the training and logging table (except the last 1 or 2 week)

# Model Monitoring & Retraining Pipeline:

In [None]:
data_log_df = data_monitoring(0)
model_log_dict = model_monitoring(0)

In [None]:
data_log_df

In [None]:
# Data drift condition
data_log_df['Is Drift'].sum() > 0

In [None]:
model_log_dict

In [None]:
# Model drift condition
model_log_dict['Model Drift IND']

In [None]:
# Max date for retraining 
max_date = model_log_dict['Time Period'].split(' ')[2]
max_date