In [43]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns
import pickle
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix


In [44]:
df= pd.read_csv('host_los.csv')

In [45]:
with open('host_los.pkl','rb') as file :
    data = pickle.load(file)
print(data)

        case_id  Hospital  Hospital_type  Hospital_city  Hospital_region  \
0             1         8              2              3                2   
1             2         2              2              5                2   
2             3        10              4              1                0   
3             4        26              1              2                1   
4             5        26              1              2                1   
...         ...       ...            ...            ...              ...   
318433   318434         6              0              6                0   
318434   318435        24              0              1                0   
318435   318436         7              0              4                0   
318436   318437        11              1              2                1   
318437   318438        19              0              7                1   

        Available_Extra_Rooms_in_Hospital    Department Ward_Type  \
0                 

In [46]:
data.columns

Index(['case_id', 'Hospital', 'Hospital_type', 'Hospital_city',
       'Hospital_region', 'Available_Extra_Rooms_in_Hospital', 'Department',
       'Ward_Type', 'Ward_Facility', 'Bed_Grade', 'patientid',
       'City_Code_Patient', 'Type of Admission', 'Illness_Severity',
       'Patient_Visitors', 'Age', 'Admission_Deposit', 'Stay_Days'],
      dtype='object')

In [47]:
data.isnull().sum()

case_id                                 0
Hospital                                0
Hospital_type                           0
Hospital_city                           0
Hospital_region                         0
Available_Extra_Rooms_in_Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility                           0
Bed_Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Illness_Severity                        0
Patient_Visitors                        0
Age                                     0
Admission_Deposit                       0
Stay_Days                               0
dtype: int64

In [48]:
data['Bed_Grade'].value_counts()



Bed_Grade
2.0    123671
3.0    110583
4.0     57566
1.0     26505
Name: count, dtype: int64

In [49]:
data['City_Code_Patient'].value_counts()

City_Code_Patient
8.0     124011
2.0      38869
1.0      26377
7.0      23807
5.0      20079
4.0      15380
9.0      11795
15.0      8950
10.0      8174
6.0       6005
12.0      5647
3.0       3772
23.0      3698
14.0      2927
16.0      2254
13.0      1625
21.0      1602
20.0      1409
18.0      1404
19.0      1028
26.0      1023
25.0       798
27.0       771
11.0       658
28.0       521
22.0       405
24.0       360
30.0       133
29.0        98
33.0        78
31.0        59
37.0        57
32.0        52
34.0        46
35.0        16
36.0        12
38.0         6
Name: count, dtype: int64

In [50]:
data['Bed_Grade'] = data['Bed_Grade'].fillna(np.nan)
data['City_Code_Patient'] = data['City_Code_Patient'].fillna(np.nan)


In [51]:
data['Stay_Days'].value_counts()

Stay_Days
0    311755
1      6683
Name: count, dtype: int64

In [52]:
data = data.drop('case_id', axis=1)

In [53]:
cat_selector = selector(dtype_include=object)
num_selector = selector(dtype_exclude=object)
cat_cols = cat_selector(data)
num_cols = num_selector(data)
print("Categorical:", cat_cols)
print("Numerical:", num_cols)

Categorical: ['Department', 'Ward_Type', 'Ward_Facility', 'Type of Admission', 'Illness_Severity', 'Age']
Numerical: ['Hospital', 'Hospital_type', 'Hospital_city', 'Hospital_region', 'Available_Extra_Rooms_in_Hospital', 'Bed_Grade', 'patientid', 'City_Code_Patient', 'Patient_Visitors', 'Admission_Deposit', 'Stay_Days']


In [54]:

categorical_features = ['Hospital', 'Hospital_type', 'Hospital_city','Hospital_region', 'Department','Ward_Type',
                        'Ward_Facility', 'Type of Admission','Department']  

numeric_features = ['City_Code_Patient','Patient_Visitors','Admission_Deposit', 'Available_Extra_Rooms_in_Hospital', 'Bed_Grade'] 

ordinal_features =[ 'Illness_Severity','Age',]

'''preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', IterativeImputer(max_iter=10, random_state=42)),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('ord', OrdinalEncoder(), ordinal_features),
        ('onehot', OneHotEncoder(drop='first'), categorical_features)
    ])'''
numeric_pipeline = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42)), 
    ('scaler', StandardScaler())  
])
ordinal_transformer = OrdinalEncoder()

categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False) 

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)
y = data.pop('Stay_Days')

transformed_data = preprocessor.fit_transform(data)


In [55]:
cat_transformer = preprocessor.named_transformers_['cat']
cat_columns = cat_transformer.get_feature_names_out(categorical_features)

In [56]:
remainder_columns = [col for col in data.columns if col not in (numeric_features + ordinal_features + categorical_features)]

all_columns = (
    numeric_features +
    ordinal_features +
    cat_columns.tolist() +
    remainder_columns
)
transformed_df = pd.DataFrame(transformed_data, columns=all_columns)

In [57]:
print(transformed_df)

        City_Code_Patient  Patient_Visitors  Admission_Deposit  \
0               -0.053385         -0.727923           0.027835   
1               -0.053385         -0.727923           0.987556   
2               -0.053385         -0.727923          -0.124910   
3               -0.053385         -0.727923           2.200319   
4               -0.053385         -0.727923           0.623175   
...                   ...               ...                ...   
318433           3.342631         -0.161049          -0.677923   
318434           0.158866          0.405826           1.673071   
318435           0.583368         -0.161049          -0.594189   
318436           0.158866          0.972701          -1.030342   
318437           0.158866         -0.727923          -0.118469   

        Available_Extra_Rooms_in_Hospital  Bed_Grade  Illness_Severity  Age  \
0                               -0.169177  -0.716831               0.0  5.0   
1                               -1.025217  -0.716

In [58]:
transformed_df.columns

Index(['City_Code_Patient', 'Patient_Visitors', 'Admission_Deposit',
       'Available_Extra_Rooms_in_Hospital', 'Bed_Grade', 'Illness_Severity',
       'Age', 'Hospital_1', 'Hospital_2', 'Hospital_3', 'Hospital_4',
       'Hospital_5', 'Hospital_6', 'Hospital_7', 'Hospital_8', 'Hospital_9',
       'Hospital_10', 'Hospital_11', 'Hospital_12', 'Hospital_13',
       'Hospital_14', 'Hospital_15', 'Hospital_16', 'Hospital_17',
       'Hospital_18', 'Hospital_19', 'Hospital_20', 'Hospital_21',
       'Hospital_22', 'Hospital_23', 'Hospital_24', 'Hospital_25',
       'Hospital_26', 'Hospital_27', 'Hospital_28', 'Hospital_29',
       'Hospital_30', 'Hospital_31', 'Hospital_32', 'Hospital_type_0',
       'Hospital_type_1', 'Hospital_type_2', 'Hospital_type_3',
       'Hospital_type_4', 'Hospital_type_5', 'Hospital_type_6',
       'Hospital_city_1', 'Hospital_city_2', 'Hospital_city_3',
       'Hospital_city_4', 'Hospital_city_5', 'Hospital_city_6',
       'Hospital_city_7', 'Hospital_city_9', 

In [59]:
X = transformed_df[['City_Code_Patient', 'Patient_Visitors', 'Admission_Deposit',
       'Available_Extra_Rooms_in_Hospital', 'Bed_Grade', 'Illness_Severity',
       'Age', 'Hospital_1', 'Hospital_2', 'Hospital_3', 'Hospital_4',
       'Hospital_5', 'Hospital_6', 'Hospital_7', 'Hospital_8', 'Hospital_9',
       'Hospital_10', 'Hospital_11', 'Hospital_12', 'Hospital_13',
       'Hospital_14', 'Hospital_15', 'Hospital_16', 'Hospital_17',
       'Hospital_18', 'Hospital_19', 'Hospital_20', 'Hospital_21',
       'Hospital_22', 'Hospital_23', 'Hospital_24', 'Hospital_25',
       'Hospital_26', 'Hospital_27', 'Hospital_28', 'Hospital_29',
       'Hospital_30', 'Hospital_31', 'Hospital_32', 'Hospital_type_0',
       'Hospital_type_1', 'Hospital_type_2', 'Hospital_type_3',
       'Hospital_type_4', 'Hospital_type_5', 'Hospital_type_6',
       'Hospital_city_1', 'Hospital_city_2', 'Hospital_city_3',
       'Hospital_city_4', 'Hospital_city_5', 'Hospital_city_6',
       'Hospital_city_7', 'Hospital_city_9', 'Hospital_city_10',
       'Hospital_city_11', 'Hospital_city_13', 'Hospital_region_0',
       'Hospital_region_1', 'Hospital_region_2',
       'Department_TB & Chest disease', 'Department_anesthesia',
       'Department_gynecology', 'Department_radiotherapy',
       'Department_surgery', 'Ward_Type_P', 'Ward_Type_Q', 'Ward_Type_R',
       'Ward_Type_S', 'Ward_Type_T', 'Ward_Type_U', 'Ward_Facility_A',
       'Ward_Facility_B', 'Ward_Facility_C', 'Ward_Facility_D',
       'Ward_Facility_E', 'Ward_Facility_F', 'Type of Admission_Emergency',
       'Type of Admission_Trauma', 'Type of Admission_Urgent',
       'Department_TB & Chest disease', 'Department_anesthesia',
       'Department_gynecology', 'Department_radiotherapy',
       'Department_surgery']]

In [60]:
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
from sklearn.ensemble import  RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

models = {
    'Logistic Regression': make_pipeline(LogisticRegression(max_iter=1000 , class_weight='balanced')),
    'Decision Tree': make_pipeline(DecisionTreeClassifier(class_weight='balanced')),
    'Logistic Regression (Lasso)': LogisticRegression(penalty='l1', solver='liblinear',class_weight='balanced'),
    'Logistic Regression (Ridge C=0.1)': LogisticRegression(penalty='l2', C=0.1,class_weight='balanced'),
    'Logistic Regression (Ridge C=0.01)': LogisticRegression(penalty='l2', C=0.01,class_weight='balanced'),
    'Logistic Regression (Ridge C=10.0)': LogisticRegression(penalty='l2', C=10.0,class_weight='balanced'), 
     'random forest': RandomForestClassifier(n_estimators=100, random_state=42,class_weight='balanced')

}

In [63]:
cv = KFold(n_splits = 5 , shuffle= True , random_state =42)

In [64]:
scorers = {
    'accuracy': 'accuracy',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

results = []

for name, model in models.items():
    cv_results = cross_validate(model, X_train, y_train, cv=cv, scoring=scorers)
    
    avg_accuracy = np.mean(cv_results['test_accuracy'])
    avg_precision_macro = np.mean(cv_results['test_precision_macro'])
    avg_recall_macro = np.mean(cv_results['test_recall_macro'])
    avg_f1_macro = np.mean(cv_results['test_f1_macro'])
    
    # Append to results (focus on CV averages)
    result_dict = {
        'Model': name,
        'CV Accuracy (Mean)': avg_accuracy,
        'CV Precision (Macro, Mean)': avg_precision_macro,
        'CV Recall (Macro, Mean)': avg_recall_macro,
        'CV F1 Score (Macro, Mean)': avg_f1_macro
    }
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    result_dict['Test Classification Report'] = classification_report(y_test, y_pred)
    result_dict['Test Confusion Matrix'] = confusion_matrix(y_test, y_pred)
    
    results.append(result_dict)

print(results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[{'Model': 'Logistic Regression', 'CV Accuracy (Mean)': 0.864639842983317, 'CV Precision (Macro, Mean)': 0.5558866539224716, 'CV Recall (Macro, Mean)': 0.841654214987865, 'CV F1 Score (Macro, Mean)': 0.5648349529619419, 'Test Classification Report': '              precision    recall  f1-score   support\n\n           0       0.99      0.86      0.93     62397\n           1       0.11      0.79      0.19      1291\n\n    accuracy                           0.86     63688\n   macro avg       0.55      0.83      0.56     63688\nweighted avg       0.98      0.86      0.91     63688\n', 'Test Confusion Matrix': array([[53949,  8448],
       [  275,  1016]], dtype=int64)}, {'Model': 'Decision Tree', 'CV Accuracy (Mean)': 0.9724278704612365, 'CV Precision (Macro, Mean)': 0.6661418625576285, 'CV Recall (Macro, Mean)': 0.663667432134375, 'CV F1 Score (Macro, Mean)': 0.6648687094108633, 'Test Classification Report': '              precision    recall  f1-score   support\n\n           0       0.99

In [89]:

model = models['Decision Tree']
y_new_pred = model.predict(X_test)
from sklearn.metrics import accuracy_score

new_accuracy = accuracy_score(y_test, y_new_pred)
new_report = classification_report(y_test, y_new_pred)
new_conf_matrix = confusion_matrix(y_test, y_new_pred)

# Print results
print(f"New Test Accuracy: {new_accuracy}")
print("New Test Classification Report:\n", new_report)
print("New Test Confusion Matrix:\n", new_conf_matrix)

New Test Accuracy: 0.9735114935309634
New Test Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     62397
           1       0.34      0.34      0.34      1291

    accuracy                           0.97     63688
   macro avg       0.66      0.66      0.66     63688
weighted avg       0.97      0.97      0.97     63688

New Test Confusion Matrix:
 [[61566   831]
 [  856   435]]
