# Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import statsmodels.formula.api as smf

# Import Dataset

In [4]:
path = "C:/Users/julia/Downloads/healthcare_data.csv"
health = pd.read_csv(path)
health.head()

Unnamed: 0,Available Extra Rooms in Hospital,Department,Ward_Facility_Code,doctor_name,staff_available,patientid,Age,gender,Type of Admission,Severity of Illness,health_conditions,Visitors with Patient,Insurance,Admission_Deposit,Stay (in days)
0,4,gynecology,D,Dr Sophia,0,33070,41-50,Female,Trauma,Extreme,Diabetes,4,Yes,2966.408696,8
1,4,gynecology,B,Dr Sophia,2,34808,31-40,Female,Trauma,Minor,Heart disease,2,No,3554.835677,9
2,2,gynecology,B,Dr Sophia,8,44577,21-30,Female,Trauma,Extreme,Diabetes,2,Yes,5624.733654,7
3,4,gynecology,D,Dr Olivia,7,3695,31-40,Female,Urgent,Moderate,,4,No,4814.149231,8
4,2,anesthesia,E,Dr Mark,10,108956,71-80,Male,Trauma,Moderate,Diabetes,2,No,5169.269637,34


In [None]:
# explaination of variables

    # patientid: Patient ID
    # Age: Range of age of the patient
    # gender: Gender of the patient
    # Type of Admission: Trauma, emergency or urgent
    # Severity of Illness: Extreme, moderate, or minor
    # health_conditions: Any previous health conditions suffered by the patient
    # Visitors with Patient: The number of patients who accompany the patient
    # Insurance: Does the patient have health insurance or not?
    # Admission_Deposit: The deposit paid by the patient during admission
    # Stay (in days): The number of days that the patient has stayed in the hospital. This is the target variable
    # Available Extra Rooms in Hospital: The number of rooms available during admission
    # Department: The department which will be treating the patient
    # Ward_Facility_Code: The code of the ward facility in which the patient will be admitted
    # doctor_name: The doctor who will be treating the patient
    # staff_available: The number of staff who are not occupied at the moment in the ward


# Explore the dataset

In [5]:
health.info()
# health condition are missing 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Available Extra Rooms in Hospital  500000 non-null  int64  
 1   Department                         500000 non-null  object 
 2   Ward_Facility_Code                 500000 non-null  object 
 3   doctor_name                        500000 non-null  object 
 4   staff_available                    500000 non-null  int64  
 5   patientid                          500000 non-null  int64  
 6   Age                                500000 non-null  object 
 7   gender                             500000 non-null  object 
 8   Type of Admission                  500000 non-null  object 
 9   Severity of Illness                500000 non-null  object 
 10  health_conditions                  348112 non-null  object 
 11  Visitors with Patient              5000

In [24]:
# drop the patient id from the original data
health1 = health.copy()
health1 = health1.drop(columns = ['patientid'])
# replace the missing data in health_condition(because it is category)
health1['health_conditions'] = health1["health_conditions"].replace(to_replace= np.NaN, value= "Missing")
health1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 14 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Available Extra Rooms in Hospital  500000 non-null  int64  
 1   Department                         500000 non-null  object 
 2   Ward_Facility_Code                 500000 non-null  object 
 3   doctor_name                        500000 non-null  object 
 4   staff_available                    500000 non-null  int64  
 5   Age                                500000 non-null  object 
 6   gender                             500000 non-null  object 
 7   Type of Admission                  500000 non-null  object 
 8   Severity of Illness                500000 non-null  object 
 9   health_conditions                  500000 non-null  object 
 10  Visitors with Patient              500000 non-null  int64  
 11  Insurance                          5000

In [36]:
# first check wether the numerical variabls correlation matrix
health1.select_dtypes('int').corr()

Unnamed: 0,Available Extra Rooms in Hospital,staff_available,Visitors with Patient,Stay (in days)
Available Extra Rooms in Hospital,1.0,-0.001784,0.070459,-0.019219
staff_available,-0.001784,1.0,0.000578,0.007398
Visitors with Patient,0.070459,0.000578,1.0,0.027302
Stay (in days),-0.019219,0.007398,0.027302,1.0


In [37]:
# get the category variables
category_variables = health1.select_dtypes('object').columns

In [44]:
# change category variables to numerical variables
cat_num_matrix = pd.get_dummies(data = health1, columns = category_variables, drop_first= True, dtype= int)
cat_num_matrix

Unnamed: 0,Available Extra Rooms in Hospital,staff_available,Visitors with Patient,Admission_Deposit,Stay (in days),Department_anesthesia,Department_gynecology,Department_radiotherapy,Department_surgery,Ward_Facility_Code_B,...,Type of Admission_Trauma,Type of Admission_Urgent,Severity of Illness_Minor,Severity of Illness_Moderate,health_conditions_Diabetes,health_conditions_Heart disease,health_conditions_High Blood Pressure,health_conditions_Missing,health_conditions_Other,Insurance_Yes
0,4,0,4,2966.408696,8,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
1,4,2,2,3554.835677,9,0,1,0,0,1,...,1,0,1,0,0,1,0,0,0,0
2,2,8,2,5624.733654,7,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,1
3,4,7,4,4814.149231,8,0,1,0,0,0,...,0,1,0,1,0,0,0,1,0,0
4,2,10,2,5169.269637,34,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,4,2,3,4105.795901,10,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
499996,13,8,2,4631.550257,11,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
499997,2,3,2,5456.930075,8,0,1,0,0,1,...,0,0,0,1,0,0,1,0,0,0
499998,2,1,2,4694.127772,23,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0


In [46]:
# remove Stay (in days)
cat_num_matrix1 = cat_num_matrix.copy()
cat_num_matrix1 = cat_num_matrix1.drop(columns= 'Stay (in days)')

In [47]:
# find all vaiables correlation matrix
# consider correlation which is bigger than 0.1 or less than -0.1

cat_num_matrix1.corr()

Unnamed: 0,Available Extra Rooms in Hospital,staff_available,Visitors with Patient,Admission_Deposit,Department_anesthesia,Department_gynecology,Department_radiotherapy,Department_surgery,Ward_Facility_Code_B,Ward_Facility_Code_C,...,Type of Admission_Trauma,Type of Admission_Urgent,Severity of Illness_Minor,Severity of Illness_Moderate,health_conditions_Diabetes,health_conditions_Heart disease,health_conditions_High Blood Pressure,health_conditions_Missing,health_conditions_Other,Insurance_Yes
Available Extra Rooms in Hospital,1.0,-0.001784,0.070459,-0.050127,-0.0029,0.045627,-0.0479,-0.005756,-0.049206,-0.019121,...,-0.035638,0.047514,0.023532,0.008689,-0.00134,-8.1e-05,0.0017,6.8e-05,0.00123,0.0019
staff_available,-0.001784,1.0,0.000578,0.000763,-0.004316,-0.005341,0.000147,0.043335,-0.002726,-0.001632,...,0.0004,-0.002287,-0.001796,0.002228,-0.001305,0.000567,-0.000479,0.000782,-0.000805,0.001481
Visitors with Patient,0.070459,0.000578,1.0,-0.069043,0.027194,-0.058815,0.023313,-0.006618,-0.08075,0.007937,...,-0.024419,0.000235,-0.010046,0.02897,-0.001514,-0.001718,-0.005739,0.009388,-0.004961,0.032858
Admission_Deposit,-0.050127,0.000763,-0.069043,1.0,0.03072,-0.042772,0.019934,0.003054,0.006685,-0.033609,...,0.016784,0.003466,0.015279,-0.003885,-0.000181,-0.001711,0.000718,-0.000851,0.00163,0.002609
Department_anesthesia,-0.0029,-0.004316,0.027194,0.03072,1.0,-0.461182,-0.140211,-0.031722,-0.159432,0.121776,...,0.056661,-0.003325,-0.039249,0.024553,-0.002208,-0.001047,0.00332,-0.001157,-0.000233,-0.000673
Department_gynecology,0.045627,-0.005341,-0.058815,-0.042772,-0.461182,1.0,-0.667163,-0.150944,0.345704,-0.256887,...,-0.018435,-0.023166,0.078683,-0.060921,-0.001426,0.002357,0.000193,-0.000848,0.000251,-0.002473
Department_radiotherapy,-0.0479,0.000147,0.023313,0.019934,-0.140211,-0.667163,1.0,-0.045891,-0.230641,0.209965,...,-0.043192,0.029711,-0.051196,0.054039,0.002553,2.5e-05,-0.002946,0.002743,-0.001631,0.002567
Department_surgery,-0.005756,0.043335,-0.006618,0.003054,-0.031722,-0.150944,-0.045891,1.0,-0.052182,-0.01767,...,0.035154,-0.020475,-0.038,0.001906,0.002307,-0.001715,0.000709,-0.001975,0.001613,-0.00126
Ward_Facility_Code_B,-0.049206,-0.002726,-0.08075,0.006685,-0.159432,0.345704,-0.230641,-0.052182,1.0,-0.088807,...,0.143878,-0.071867,-0.04199,-0.02118,0.002142,0.001307,-0.000895,-0.00063,-0.000463,-0.001614
Ward_Facility_Code_C,-0.019121,-0.001632,0.007937,-0.033609,0.121776,-0.256887,0.209965,-0.01767,-0.088807,1.0,...,-0.09484,-0.039494,-0.022316,-0.002371,-0.001943,0.000701,-3.5e-05,0.000537,-0.00022,-0.002174


### Use VIF to emlinate variables which has correlation with other variables

In [48]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [79]:
# calculate variance_inflation_factor(i th variable with the remaing variables), if this 
# calculate is bigger than some number, then remove this variables
cat_num_variables = cat_num_matrix1.columns
new_cat_num = cat_num_matrix1
judge = 0
while(judge ==0 ):
  VIF = [variance_inflation_factor(exog= new_cat_num, exog_idx= m) for m in range(len(new_cat_num.columns))]
  if max(VIF) < 5.0:
    judge = 1
  else:
    judge = 0
    variable_i = new_cat_num.columns[np.argmax(VIF)]
    new_cat_num = new_cat_num.drop(columns = variable_i)    


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


In [80]:
# find the new non correlated independent variables
health2 = new_cat_num.copy()
health2['Stay (in days)'] = health['Stay (in days)']

In [81]:
new_cat_num.columns

Index(['Available Extra Rooms in Hospital', 'staff_available',
       'Visitors with Patient', 'Department_anesthesia',
       'Department_radiotherapy', 'Ward_Facility_Code_C',
       'Ward_Facility_Code_D', 'Ward_Facility_Code_E', 'Ward_Facility_Code_F',
       'doctor_name_Dr John', 'doctor_name_Dr Mark', 'doctor_name_Dr Nathan',
       'doctor_name_Dr Sarah', 'doctor_name_Dr Simon', 'doctor_name_Dr Sophia',
       'Age_11-20', 'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_61-70',
       'Age_71-80', 'Age_81-90', 'Age_91-100', 'gender_Other',
       'Type of Admission_Trauma', 'Type of Admission_Urgent',
       'Severity of Illness_Minor', 'health_conditions_Diabetes',
       'health_conditions_Heart disease',
       'health_conditions_High Blood Pressure', 'health_conditions_Missing',
       'health_conditions_Other', 'Insurance_Yes'],
      dtype='object')

In [82]:
health2.columns

Index(['Available Extra Rooms in Hospital', 'staff_available',
       'Visitors with Patient', 'Department_anesthesia',
       'Department_radiotherapy', 'Ward_Facility_Code_C',
       'Ward_Facility_Code_D', 'Ward_Facility_Code_E', 'Ward_Facility_Code_F',
       'doctor_name_Dr John', 'doctor_name_Dr Mark', 'doctor_name_Dr Nathan',
       'doctor_name_Dr Sarah', 'doctor_name_Dr Simon', 'doctor_name_Dr Sophia',
       'Age_11-20', 'Age_31-40', 'Age_41-50', 'Age_51-60', 'Age_61-70',
       'Age_71-80', 'Age_81-90', 'Age_91-100', 'gender_Other',
       'Type of Admission_Trauma', 'Type of Admission_Urgent',
       'Severity of Illness_Minor', 'health_conditions_Diabetes',
       'health_conditions_Heart disease',
       'health_conditions_High Blood Pressure', 'health_conditions_Missing',
       'health_conditions_Other', 'Insurance_Yes', 'Stay (in days)'],
      dtype='object')

# Use OLS to remove unimportant variables

In [83]:
from scipy.stats import boxcox
from statsmodels.formula.api import ols

In [84]:
# use boxcox find appropriate lambda and normalize the stay
stay = health2['Stay (in days)']
stay1, lambda1 = boxcox(stay)
lambda2 = lambda1//0.5 * 0.5
stay2 = boxcox(stay, lambda2) 

In [85]:
health2['Stay (in days)'] = stay2

In [86]:
# use ols 
b = 'Stay (in days)' + ' ~ ' ' + '.join(new_cat_num.columns)
# result = ols(formula= , data = health2).fit()
b 

'Stay (in days)Available Extra Rooms in Hospital ~  + staff_available ~  + Visitors with Patient ~  + Department_anesthesia ~  + Department_radiotherapy ~  + Ward_Facility_Code_C ~  + Ward_Facility_Code_D ~  + Ward_Facility_Code_E ~  + Ward_Facility_Code_F ~  + doctor_name_Dr John ~  + doctor_name_Dr Mark ~  + doctor_name_Dr Nathan ~  + doctor_name_Dr Sarah ~  + doctor_name_Dr Simon ~  + doctor_name_Dr Sophia ~  + Age_11-20 ~  + Age_31-40 ~  + Age_41-50 ~  + Age_51-60 ~  + Age_61-70 ~  + Age_71-80 ~  + Age_81-90 ~  + Age_91-100 ~  + gender_Other ~  + Type of Admission_Trauma ~  + Type of Admission_Urgent ~  + Severity of Illness_Minor ~  + health_conditions_Diabetes ~  + health_conditions_Heart disease ~  + health_conditions_High Blood Pressure ~  + health_conditions_Missing ~  + health_conditions_Other ~  + Insurance_Yes'