## Healthcare Analytics
Predict the length of a patient's stay in the hospital.

### Import libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

In [2]:
import yaml

# Read the tunable settings from the YAML file (path is relative to the
# working directory). Later cells index the result by section name:
# "train_test_split" and "XGBClassifier".
with open('hyper-parameters.yaml', 'r') as config_stream:
    hyper_parameters = yaml.safe_load(config_stream)

### Define constants

In [3]:
# Constants.

# Feature name -> kind of feature ("category" or "numeric"). This mapping is
# the single source of truth; the two feature lists below are derived from it
# and keep the insertion order used here.
COLUMN_TYPES = {
    "Hospital_code": "category",
    "Hospital_type_code": "category",
    "City_Code_Hospital": "category",
    "Hospital_region_code": "category",
    "Department": "category",
    "Ward_Type": "category",
    "Ward_Facility_Code": "category",
    "Bed Grade": "category",
    "City_Code_Patient": "category",
    "Type of Admission": "category",
    "Severity of Illness": "category",
    "Age": "category",
    "Available Extra Rooms in Hospital": "numeric",
    "Visitors with Patient": "numeric",
    "Admission_Deposit": "numeric",
}

# Name of the column the model is trained to predict.
TARGET_COLUMN_NAME = "Stay"

# Feature names split by kind.
NUMERICAL_FEATURES = [
    column for column, kind in COLUMN_TYPES.items() if kind == "numeric"
]
CATEGORICAL_FEATURES = [
    column for column, kind in COLUMN_TYPES.items() if kind == "category"
]

### Load and prepare data

In [4]:
# Importing datasets
import zipfile


def read_zipped_csv(archive, member):
    """Return the CSV file `member` stored inside the open zip `archive` as a DataFrame."""
    with archive.open(member) as handle:
        return pd.read_csv(handle)


# Both CSV files are read straight from the archive, without extracting it.
with zipfile.ZipFile("Datasets.zip", 'r') as archive:
    patients_df = read_zipped_csv(archive, "train.csv")
    test = read_zipped_csv(archive, "test.csv")

### Preprocess target column

In [5]:
# Replace the raw target labels with the integer codes assigned by the encoder.
# `le` stays in scope so the codes can be mapped back to the original labels.
le = LabelEncoder()
encoded_stay = le.fit_transform(patients_df[TARGET_COLUMN_NAME])
patients_df[TARGET_COLUMN_NAME] = encoded_stay

### Train-test split

In [6]:
# Separate the target from the features using the shared constant (instead of
# repeating the literal column name), then hold out an evaluation set.
features_df = patients_df.drop(columns=TARGET_COLUMN_NAME)
target = patients_df[TARGET_COLUMN_NAME]

# Split settings come from the "train_test_split" section of the YAML file.
# NOTE(review): confirm that section sets `random_state`, otherwise the split
# (and the reported accuracies) change on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    target,
    **hyper_parameters["train_test_split"],
)

### Defining the preprocessing steps

In [7]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# map every category to an integer code.
# With the encoder's default (handle_unknown="error") a category that was not
# present when fitting makes transform/predict raise. Here such values are
# encoded as -1, a code that never collides with the fitted codes (0..n-1).
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Route each feature list to its transformer. Columns in neither list are
# dropped, which is ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, NUMERICAL_FEATURES),
    ("cat", categorical_transformer, CATEGORICAL_FEATURES),
])

### Training the XGBoost model

In [9]:
# Classifier settings come from the "XGBClassifier" section of the YAML file.
classifier = XGBClassifier(**hyper_parameters["XGBClassifier"])

# Preprocessing and classifier are chained so the same transformations are
# applied at fit and predict time.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", classifier),
])
pipeline.fit(X_train, y_train)

# Predict on both splits so train and held-out accuracy can be compared.
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Train accuracy: {train_accuracy: .4f}")
print(f"Test accuracy: {test_accuracy: .4f}")

Train accuracy:  0.4469
Test accuracy:  0.4249


### Preparation for the CI/CD pipeline

In [11]:
import pathlib
from giskard import Dataset, Model

# Output directories for the serialized artefacts (relative to the working directory).
model_dir = pathlib.Path("model")
dataset_dir = pathlib.Path("dataset")

# Re-attach the target to the held-out features: the wrapped dataset needs
# features and target in one frame.
raw_data = pd.concat([X_test, y_test], axis=1)

wrapped_data = Dataset(
    df=raw_data,
    name="Patients data",
    target=TARGET_COLUMN_NAME,
    cat_columns=CATEGORICAL_FEATURES
)

wrapped_model = Model(
    model=pipeline,
    model_type="classification",
    name="Patients stay period classifier [XGBoost]",
    classification_labels=pipeline.classes_.tolist(),
    # Pass an explicit list of names rather than a dict view.
    feature_names=list(COLUMN_TYPES)
)

# dumping
# exist_ok=True keeps this cell re-runnable: without it, mkdir raises
# FileExistsError as soon as the directories exist from a previous run.
model_dir.mkdir(parents=True, exist_ok=True)
dataset_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): the second argument (0) is kept from the original call;
# presumably it is the dataset id -- confirm against giskard's Dataset.save.
wrapped_data.save(dataset_dir, 0)
wrapped_model.save(model_dir)
print("Your model and dataset are successfully dumped for CI/CD.")

Your model and dataset are successfully dumped for CI/CD.
