In [7]:
import os
import pickle
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import metrics

import sqlalchemy
import snowflake.connector
from sqlalchemy import create_engine
from snowflake.sqlalchemy import *

import Preprocessing
from Preprocessing import preprocess_data

import model_selection 
from model_selection import train_and_save_best_model

import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import joblib

from sklearn.model_selection import train_test_split


import xgboost as xgb
from datetime import datetime, timedelta
import time  
import pytz    

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, HuberRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn import metrics
from sklearn.metrics import r2_score 

In [8]:
# Snowflake login details. The values are read from the environment so that
# real credentials are never saved inside the notebook. The dict keys are
# unchanged, and each value falls back to an empty string when its variable
# is not set.
snowflake_creds = {
    'USER_NAME': os.environ.get('SNOWFLAKE_USER', ''),
    'PASSWORD': os.environ.get('SNOWFLAKE_PASSWORD', '')
}

In [None]:
# Creating the connection engine (way 1)
# snowflake_creds is a plain dict, so its entries are read with [...];
# attribute access (snowflake_creds.USER_NAME) raises AttributeError.
# NOTE(review): the connection uses the ACCOUNTADMIN role — confirm that a
# narrower role would not be sufficient for reading and writing these tables.
engine = create_engine(URL(
        account="tr09543.ap-south-1",
        user=snowflake_creds['USER_NAME'],
        password=snowflake_creds['PASSWORD'],
        role="ACCOUNTADMIN",
        warehouse="COMPUTE_WH",
        database="HEALTHDB",
        schema="HEALTHSCHEMA"
    ))

In [2]:
# SQL used to pull the scoring rows from HEALTHDB.HEALTHSCHEMA.HEALTH_DATA.
#   * BASE: replaces NULLs with defaults (0 / 'None' / 'Minor') and keeps only
#     rows where both ADMISSION_DATE and DISCHARGE_DATE are present.
#   * BASE_WITH_FEATURES: adds date parts, three concatenated category
#     columns, a STAY_DURATION bucket and the stay length in days, keeping
#     only rows whose discharge is not before the admission.
#   * The final SELECT keeps rows admitted exactly 45 days before the
#     current date.
# NOTE(review): the stay length is aliased LENGHTH_OF_STAY (sic), while later
# cells read a LENGTH_OF_STAY column — confirm where the rename happens.
# NOTE(review): STAY_DURATION is computed from the same DATEDIFF as the stay
# length; if it reaches the model as a feature it leaks the target — verify.
query = """

WITH BASE AS (
    SELECT
        CASE_ID,
        COALESCE(HOSPITAL_CODE, 0) AS HOSPITAL_CODE,
        COALESCE(HOSPITAL_TYPE_CODE, 'None') AS HOSPITAL_TYPE_CODE,
        COALESCE(CITY_CODE_HOSPITAL, 0) AS CITY_CODE_HOSPITAL,
        COALESCE(HOSPITAL_REGION_CODE, 'None') AS HOSPITAL_REGION_CODE,
        COALESCE(AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL, 0) AS AVAILABLE_EXTRA_ROOMS_IN_HOSPITAL,
        COALESCE(DEPARTMENT, 'None') AS DEPARTMENT,
        COALESCE(WARD_TYPE, 'None') AS WARD_TYPE,
        COALESCE(WARD_FACILITY_CODE, 'None') AS WARD_FACILITY_CODE,
        COALESCE(BED_GRADE, 0) AS BED_GRADE,
        PATIENTID,
        COALESCE(CITY_CODE_PATIENT, 0) AS CITY_CODE_PATIENT,
        COALESCE(TYPE_OF_ADMISSION, 'None') AS TYPE_OF_ADMISSION,
        COALESCE(SEVERITY_OF_ILLNESS, 'Minor') AS SEVERITY_OF_ILLNESS,
        COALESCE(VISITORS_WITH_PATIENT, 0) AS VISITORS_WITH_PATIENT,
        COALESCE(AGE, 'None') AS AGE,
        COALESCE(ADMISSION_DEPOSIT, 0) AS ADMISSION_DEPOSIT,
        ADMISSION_DATE,
        DISCHARGE_DATE
    FROM HEALTHDB.HEALTHSCHEMA.HEALTH_DATA
    WHERE ADMISSION_DATE IS NOT NULL AND DISCHARGE_DATE IS NOT NULL -- Ensure dates are valid
),
    
BASE_WITH_FEATURES AS (
    SELECT *,
        -- Extract year, month, and day as separate columns
        YEAR(ADMISSION_DATE) AS ADMISSION_YEAR,
        MONTH(ADMISSION_DATE) AS ADMISSION_MONTH,
        DAY(ADMISSION_DATE) AS ADMISSION_DAY,
        MONTHNAME(ADMISSION_DATE) AS ADMISSION_MONTH_NAME,
        DAYNAME(ADMISSION_DATE) AS ADMISSION_DAY_NAME,
        CONCAT(TYPE_OF_ADMISSION, '-', SEVERITY_OF_ILLNESS) AS ADMISSION_ILLNESS_COMB,
        CONCAT(SEVERITY_OF_ILLNESS, '-', CAST(BED_GRADE AS VARCHAR)) AS ILLNESS_BEDGRADE_COMB,
        CONCAT(DEPARTMENT, '-', SEVERITY_OF_ILLNESS) AS DEPARTMENT_ILLNESS_COMB,
        CASE -- Additional categorization using CASE statement
            WHEN DATEDIFF(day, ADMISSION_DATE, DISCHARGE_DATE) <= 7 THEN 'Short Stay'
            WHEN DATEDIFF(day, ADMISSION_DATE, DISCHARGE_DATE) <= 14 THEN 'Medium Stay'
            ELSE 'Long Stay'
        END AS STAY_DURATION,
        DATEDIFF(day, ADMISSION_DATE, DISCHARGE_DATE) AS LENGHTH_OF_STAY -- Length of Stay
    FROM BASE
    WHERE DISCHARGE_DATE >= ADMISSION_DATE -- Ensure logical discharge dates
)

SELECT * FROM BASE_WITH_FEATURES WHERE ADMISSION_DATE = CURRENT_DATE-45

"""

In [None]:
def check_n_create_model_features(df, feat_list):
    """Return a new frame holding exactly the columns named in ``feat_list``.

    Columns that exist in ``df`` are copied over; columns that do not exist
    are created and filled with 0. The result follows the order of
    ``feat_list`` and keeps the row index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to take the feature columns from.
    feat_list : iterable of column labels
        Features the model expects. A label that is repeated is used once,
        at its first position.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # dict.fromkeys removes repeated labels while keeping their first position.
    wanted_columns = list(dict.fromkeys(feat_list))

    # A single reindex keeps the row index of df for every column. Building
    # the frame column by column from an empty DataFrame gives a zero-length
    # column for a missing feature, which pandas then pads with NaN (not 0)
    # as soon as the first real column is assigned.
    return df.reindex(columns=wanted_columns, fill_value=0)

In [None]:
def insert_predictions_to_snowflake_table(data):
    """Append ``data`` to the Snowflake prediction logging table.

    A dedicated engine is opened for the write and released afterwards,
    whether or not the write succeeds.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to append to TEMP_LOS_PREDICTION_MODEL_LOGGING_TABLE_HARI.

    Returns
    -------
    str
        The literal ``'Success'`` once the write has completed. Errors from
        the write are not caught and propagate to the caller.
    """
    from snowflake.connector.pandas_tools import pd_writer

    # snowflake_creds is a dict, so the entries are read with [...];
    # attribute access (snowflake_creds.USER_NAME) raises AttributeError.
    engine = create_engine(URL(
            account="tr09543.ap-south-1",
            user=snowflake_creds['USER_NAME'],
            password=snowflake_creds['PASSWORD'],
            role="ACCOUNTADMIN",
            warehouse="COMPUTE_WH",
            database="HEALTHDB",
            schema="HEALTHSCHEMA"
        ))

    # Logging table that receives the predictions
    table = 'TEMP_LOS_PREDICTION_MODEL_LOGGING_TABLE_HARI'

    try:
        # Inserting the data to snowflake logging table
        data.to_sql(table, engine, index=False, if_exists='append', method=pd_writer)
    finally:
        # Close the pooled connections held by this function-local engine.
        engine.dispose()
    return 'Success'

In [None]:
# Loading the scoring data. The connection is only needed for the query
# itself, so it is released before the local preprocessing starts.
with engine.connect() as conn:
    score_data = pd.read_sql(query, conn)

# Column names are upper-cased before they reach the preprocessing step.
score_data.columns = [col.upper() for col in score_data.columns]

# Applying the preprocessing steps
score_data_processed = Preprocessing.preprocess_data(score_data)

# Applying feature selection
# NOTE(review): read_pickle can execute code stored in the file — only load a
# MODEL_FEATS.pkl that was produced by a trusted run.
final_feats = pd.read_pickle('MODEL_FEATS.pkl')
score_data_final = check_n_create_model_features(score_data_processed, final_feats)

In [None]:
# Train and Test split
# NOTE(review): the SQL query aliases the target as LENGHTH_OF_STAY, while this
# cell reads LENGTH_OF_STAY — confirm that the preprocessing step / MODEL_FEATS
# supply the column under this name.
X = score_data_final.drop('LENGTH_OF_STAY',axis=1)
y = score_data_final[['LENGTH_OF_STAY']]
print(X.shape,y.shape)

# No stratify= here: the target is a numeric length of stay, not a class label.
# Stratifying on it makes train_test_split raise ValueError as soon as any
# target value occurs only once in the data.
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=123)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

In [None]:
import model_selection 
from model_selection import train_and_save_best_model

In [None]:
# Train the candidate models and save the best one to best_regression_model.pkl.
# The function is called once: wrapping the call in a second
# train_and_save_best_model(...) passes only the first call's return value to
# the outer call, which does not match the five arguments used here.
train_and_save_best_model(x_train, y_train, x_test, y_test, "best_regression_model.pkl")

In [None]:
# Load the model that the training step saved to disk.
# NOTE(review): joblib.load unpickles the file, which can execute code — only
# load a best_regression_model.pkl that was produced by a trusted run.
best_model = joblib.load("best_regression_model.pkl")

# Now, you can use the best model for predictions.
# The loaded estimator itself is used: constructing a new LinearRegression(),
# RandomForestRegressor() or XGBRegressor() here would give an unfitted model
# and throw the training result away.
if not isinstance(best_model, (LinearRegression, RandomForestRegressor, xgb.XGBRegressor)):
    raise TypeError(
        f"Unsupported model type loaded from best_regression_model.pkl: {type(best_model).__name__}"
    )

model = best_model

In [None]:
# Score with the fitted estimator loaded from best_regression_model.pkl.
# load_model('MODEL_XGB.model') is not used: only XGBoost estimators have that
# method, and it would replace the freshly trained model with a different file.
# The target column must be present; an earlier PREDICTED_LOS column is dropped
# too so that re-running this cell does not feed it back in as a feature.
scoring_features = (
    score_data_final
    .drop(columns=['LENGTH_OF_STAY'])
    .drop(columns=['PREDICTED_LOS'], errors='ignore')
)
score_data_final['PREDICTED_LOS'] = np.ceil(best_model.predict(scoring_features))

In [None]:
# Persist the scored rows in Snowflake: turn the index into a regular column,
# left-join the model frame onto the raw scoring rows by CASE_ID, then append
# the combined frame to the logging table.
score_data_final = score_data_final.reset_index()
score_data_table = score_data.merge(score_data_final, how='left', on='CASE_ID')
status = insert_predictions_to_snowflake_table(score_data_table)

In [None]:
# Shape and first rows of the query result (column names upper-cased).
# NOTE(review): the rows include PATIENTID — clear this output before sharing.
print(score_data.shape)
score_data.head()

In [None]:
# Shape and first rows of the output of Preprocessing.preprocess_data.
print(score_data_processed.shape)
score_data_processed.head()

In [None]:
# Shape and first rows of the model feature frame in its current state
# (earlier cells add PREDICTED_LOS to it and reset its index).
print(score_data_final.shape)
score_data_final.head()

In [None]:
# Shape and first rows of the merged frame that is passed to
# insert_predictions_to_snowflake_table.
print(score_data_table.shape)
score_data_table.head()