In [8]:
#import the basic libraries for exploration 
import pandas as pd 
import numpy as np 
from matplotlib import pyplot as plt
import seaborn as sns 





In [38]:
#create a cleaning function
def wrangle(file):
    df = pd.read_csv(file)

    #drop columns that will not be needed 
    df.drop(columns=['Room Number', 'Doctor', 'Name'], inplace=True)

    #remove outliers, Remove trails of negative numbers 
    low, high = df['Billing Amount'].quantile([0.1, 0.9])
    mask1 = df['Billing Amount'].between(low, high)
    df = df[mask1]

    #remove Low and High cardinality data 
    df.drop(columns=['Hospital'], inplace=True)

    #calculate a new column of the length of Hospital day 
    df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
    df['Discharge Date'] = pd.to_datetime(df['Discharge Date'])
    df['Hospital_stay'] = (df['Discharge Date'] - df['Date of Admission']).dt.days
    df.drop(columns=['Date of Admission', 'Discharge Date'], inplace=True)

    #drop features  that would cause Data leakage
    df.drop(columns=['Test Results', 'Medication'], inplace=True)

    #drop null values 
    df.dropna(inplace=True)



    return df




In [40]:
#we are going to start data preprocessing for analysis 
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

#file path 
file_path = 'F:/Data science/projects/data_analysis/Health_care_project/data/healthcare_dataset.csv'

#Import the data 
df = wrangle(file_path) 

#split the data into features and variables 
X = df.drop(columns=['Billing Amount'])
y = df['Billing Amount']

#split into train and test features 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#create a dict of model Names 
model_dict = {
    'linearregression': LinearRegression(),
    'ridge': Ridge(),
    'lasso': Lasso(),
    'decisiontree': DecisionTreeRegressor(),
    'randomforest': RandomForestRegressor(),
    'SVR': SVR()
}

#lets start processing everything 

results_score = []

for name, model in model_dict.items():
    print('Processing the Model training...')
    model = make_pipeline(
        OneHotEncoder(use_cat_names=True),
        SimpleImputer(),
        model
    )
    model.fit(X_train, y_train)

    #make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    #calculate the scores 
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    rmse_train = np.sqrt(mae_train)
    rmse_test = np.sqrt(mae_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)

    #append the scores in the results 
    results_score.append({
        'name': name,
        'mae_train': mae_train,
        'mae_test': mae_test,
        'rmse_train': rmse_train,
        'rmse_test': rmse_test,
        'r2_train': r2_train,
        'r2_test': r2_test
    })


scores_df = pd.DataFrame(results_score)





Processing the Model training...
Processing the Model training...
Processing the Model training...
Processing the Model training...
Processing the Model training...
Processing the Model training...


In [41]:
scores_df

Unnamed: 0,name,mae_train,mae_test,rmse_train,rmse_test,r2_train,r2_test
0,linearregression,9815.805956,9865.596939,99.074749,99.325711,0.000756,-0.001161
1,ridge,9815.806449,9865.595747,99.074752,99.325705,0.000756,-0.001161
2,lasso,9815.860553,9865.407937,99.075025,99.32476,0.000755,-0.001119
3,decisiontree,76.3881,12356.477392,8.740029,111.159693,0.994113,-0.912685
4,randomforest,3593.898003,9648.177825,59.949128,98.225138,0.855593,-0.000961
5,SVR,9820.98376,9859.829016,99.100877,99.296672,4e-06,-4e-06


In [43]:
#since we already tried bagging regressor random forest let us see on How it performs on various boosters
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
import time

models = {
    'GBM': GradientBoostingRegressor(),
    'Adaboost': AdaBoostRegressor()
}
results_score_boost = []
for name, model in models.items():
    print('Processing the Model training...')
    start_time = time.time()
    model = make_pipeline(
        OneHotEncoder(use_cat_names=True),
        SimpleImputer(),
        model
    )
    model.fit(X_train, y_train)
    

    #make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    duration = time.time() - start_time

    print('training complete...')

    #calculate the scores 
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)

    #append the scores in the results 
    results_score_boost.append({
        'name': name,
        'mae_train': mae_train,
        'mae_test': mae_test,
        'r2_train': r2_train,
        'r2_test': r2_test,
        'time': duration
    })


scores_df_boost = pd.DataFrame(results_score_boost)

    

Processing the Model training...
training complete...
Processing the Model training...
training complete...


In [44]:
scores_df_boost


Unnamed: 0,name,mae_train,mae_test,r2_train,r2_test,time
0,GBM,9768.866248,9862.530317,0.009572,-0.001854,8.766514
1,Adaboost,9811.17535,9860.759257,0.001127,-0.001701,1.289753


In [45]:
from sklearn.ensemble import StackingRegressor

base_learners = [
    ('Ridge', Ridge()),
    ('DT', DecisionTreeRegressor()),
    ('svr', SVR())
]

final_estimator = RandomForestRegressor()

stack_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=final_estimator,
    passthrough=False
)

model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    stack_model
)

model.fit(X_train, y_train)