In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns




Merging the data sets

In [2]:
# Load the three raw sources; they are joined on 'Customer ID' in the cells below.
# Hospitalisation details (CSV).
df_1 = pd.read_csv('Hospitalisation details.csv')
# Customer names (Excel workbook).
df_2 = pd.read_excel('Names.xlsx')
# Medical examination results (CSV).
df_3 = pd.read_csv('Medical Examinations.csv')

In [3]:
# Join hospitalisation details with names on 'Customer ID' (pd.merge defaults to an
# inner join, so customers missing from either table are dropped).
semi_comp = pd.merge(df_1, df_2, on='Customer ID')

In [4]:
# Add the medical examination columns; `complete` is the working frame for the rest
# of the notebook.
complete = pd.merge(semi_comp, df_3, on = 'Customer ID')
complete.head()

Unnamed: 0,Customer ID,year,month,date,children,charges,Hospital tier,City tier,State ID,name,BMI,HBA1C,Heart Issues,Any Transplants,Cancer history,NumberOfMajorSurgeries,smoker
0,Id2335,1992,Jul,9,0,563.84,tier - 2,tier - 3,R1013,"German, Mr. Aaron K",17.58,4.51,No,No,No,1,No
1,Id2334,1992,Nov,30,0,570.62,tier - 2,tier - 1,R1013,"Rosendahl, Mr. Evan P",17.6,4.39,No,No,No,1,No
2,Id2333,1993,Jun,30,0,600.0,tier - 2,tier - 1,R1013,"Albano, Ms. Julie",16.47,6.35,No,No,Yes,1,No
3,Id2332,1992,Sep,13,0,604.54,tier - 3,tier - 3,R1013,"Riveros Gonzalez, Mr. Juan D. Sr.",17.7,6.28,No,No,No,1,No
4,Id2331,1998,Jul,27,0,637.26,tier - 3,tier - 3,R1013,"Brietzke, Mr. Jordan",22.34,5.57,No,No,No,1,No


# EDA and Feature Engineering

In [5]:
complete.shape

(2335, 17)

In [6]:
complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2335 entries, 0 to 2334
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             2335 non-null   object 
 1   year                    2335 non-null   object 
 2   month                   2335 non-null   object 
 3   date                    2335 non-null   int64  
 4   children                2335 non-null   int64  
 5   charges                 2335 non-null   float64
 6   Hospital tier           2335 non-null   object 
 7   City tier               2335 non-null   object 
 8   State ID                2335 non-null   object 
 9   name                    2335 non-null   object 
 10  BMI                     2335 non-null   float64
 11  HBA1C                   2335 non-null   float64
 12  Heart Issues            2335 non-null   object 
 13  Any Transplants         2335 non-null   object 
 14  Cancer history          2335 non-null   

In [None]:
complete.describe()

Checking for Null Values

In [None]:
complete.isnull().sum()

In [None]:
complete

Identifying unique values

In [None]:
complete.nunique()

In [None]:
complete['smoker'].unique()

In [None]:
complete['smoker'].value_counts()

In [None]:
complete[complete['smoker'] == '?']

In [None]:
complete['NumberOfMajorSurgeries'].value_counts()

In [None]:
# Impute the unknown smoker status ('?') with the most common value, 'No', because
# the other fields of those rows are important and the rows should be kept.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to update `complete` (it does not under pandas copy-on-write).
complete['smoker'] = complete['smoker'].replace('?', 'No')
complete['smoker'].value_counts()

Dealing with State information
Given: The most important states to work with are R1011, R1012 and R1013.

To avoid losing a lot of information, I will group all the remaining states into a single bin.

In [None]:
specific_char = '?'

# Replace the '?' placeholder in every text (object) column with that column's most
# frequent *valid* value.
for col in complete.columns:
    if complete[col].dtype == 'object':
        is_placeholder = complete[col] == specific_char
        if is_placeholder.any():
            # Take the mode over the non-placeholder rows only; otherwise a column
            # dominated by '?' would be "imputed" with '?' itself.
            valid_values = complete.loc[~is_placeholder, col]
            if not valid_values.empty:
                mode_value = valid_values.mode().iloc[0]
                complete[col] = complete[col].replace(specific_char, mode_value)



In [None]:
# Integer columns: replace any '?' placeholder with the column mean.
# NOTE(review): a column parsed with an integer dtype cannot hold the string '?', so
# this loop is expected to be a no-op - confirm whether it is still needed.
for col in complete.columns:
    # is_integer_dtype matches every integer width; comparing the dtype to 'int'
    # only matches the platform's default integer.
    if pd.api.types.is_integer_dtype(complete[col]):
        if complete[col].eq(specific_char).any():
            # Series.mean() already returns a scalar, so there is nothing to index
            # with .iloc (the previous `.mean().iloc[0]` would raise).
            mean_value = complete[col].mean()
            complete[col] = complete[col].replace(specific_char, mean_value)

In [None]:
# Float columns: replace any '?' placeholder with the column mean.
# NOTE(review): a float64 column cannot hold the string '?', so this loop is expected
# to be a no-op - confirm whether it is still needed.
for col in complete.columns:
    if complete[col].dtype == 'float64':
        if complete[col].eq(specific_char).any():
            # Series.mean() already returns a scalar, so there is nothing to index
            # with .iloc (the previous `.mean().iloc[0]` would raise).
            mean_value = complete[col].mean()
            complete[col] = complete[col].replace(specific_char, mean_value)

In [None]:
complete['State ID'].value_counts()

In [None]:
states = ['R1013', 'R1012', 'R1011']

# Keep the three states of interest and bin every other state into 'R0000'.
# A single vectorised pass replaces the row-by-row loop, which rescanned the whole
# column for each row and relied on mutating a column selection in place.
complete['State ID'] = complete['State ID'].where(complete['State ID'].isin(states), 'R0000')
complete['State ID'].value_counts()

In [None]:
# Encode the binned state labels as integers. Series.map with a dict turns any value
# that is not a key into NaN, so this cell must run exactly once, after the binning
# cell has reduced the column to these four labels.
complete['State ID'] = complete['State ID'].map({
                        'R1013': 0,
                        'R1012': 1,
                        'R1011': 2,
                        'R0000': 3
})

In [None]:
complete['State ID'].value_counts()

Cleaning NumberOfMajorSurgeries, since it mixes numeric values and text

In [None]:
complete['NumberOfMajorSurgeries'].value_counts()

Number of major surgeries

No major Surgery = 0

In [None]:
# Convert the surgery count to integers, with 'No major surgery' encoded as 0.
# The keys are strings; any value that is not one of these keys becomes NaN.
# NOTE(review): assumes the column holds the digits as text ('1', '2', '3') - confirm
# against the value_counts() output above.
complete['NumberOfMajorSurgeries'] = complete['NumberOfMajorSurgeries'].map({
                        '1': 1,
                        '2': 2,
                        '3': 3,
                        'No major surgery': 0
})
complete['NumberOfMajorSurgeries']

In [None]:
complete['NumberOfMajorSurgeries'].value_counts()

Addressing Age

In [None]:
complete.head(1)

In [None]:
# Convert the birth year stored in 'year' into an age relative to REFERENCE_YEAR.
# The column keeps its name because later cells use 'year' as the age feature.
REFERENCE_YEAR = 2023.0
year_numeric = pd.to_numeric(complete['year'], errors='coerce')
# Only four-digit values are birth years. Anything else (e.g. an age produced by an
# earlier run of this cell) is left untouched, so re-running the cell does not flip
# ages back into years. Non-numeric entries become NaN.
is_birth_year = year_numeric.between(1000, 9999)
complete['year'] = year_numeric.where(~is_birth_year, REFERENCE_YEAR - year_numeric)


In [None]:
complete['year'].min(), complete['year'].max()

In [None]:
# Bucket ages into four bands. pd.cut intervals are right-closed, so
# include_lowest=True is needed for an age of exactly 19 to land in the first band
# instead of becoming NaN. The last label is '56-66' to match the (55, 66] bin.
complete['age_gap'] = pd.cut(pd.to_numeric(complete['year']),
                             bins=(19, 35, 45, 55, 66),
                             labels=['19-35', '36-45', '46-55', '56-66'],
                             include_lowest=True)

In [None]:
complete['age_gap'].value_counts()

Adding Gender to our dataset

In [None]:
complete.columns

In [None]:
complete['name']

In [None]:
import re
# Pull the honorific ('Mr.', 'Ms.' or 'Mrs.') out of each name as a proxy for gender;
# names that contain none of the three yield NaN.
complete['gender']  = complete['name'].str.extract(r'(Mr\.|Ms\.|Mrs\.)')
complete.columns

In [None]:
complete['gender'].value_counts()

Male: 0


Female: 1

In [None]:
complete['gender'] = complete['gender'].map({
                        'Mr.': 0,
                        'Ms.':1,
                        'Mrs.':1
})

In [None]:
complete['gender'].value_counts().sum()

In [None]:
complete.head(1)

Other features I believe we should map or discard

In [None]:
complete = complete.drop(columns=['month', 'date','Customer ID'])

In [None]:
complete = complete.drop(columns='name')
complete.columns

In [None]:
complete['Hospital tier'].value_counts()

In [None]:
complete['Hospital tier'] = complete['Hospital tier'].map({
                        'tier - 1': 1,
                        'tier - 2' :2,
                        'tier - 3': 3
})



In [None]:
complete['Hospital tier'].value_counts()

In [None]:
complete.columns

In [None]:
complete['City tier'].value_counts()

In [None]:
complete['City tier'] = complete['City tier'].map({
                        'tier - 1': 1,
                        'tier - 2' :2,
                        'tier - 3': 3
})


In [None]:
complete['City tier'].value_counts()

Heart Issues 


No = 0
Yes = 1

In [None]:
complete['Heart Issues'].value_counts()

In [None]:
# Encode Heart Issues as 0 (no) / 1 (yes).
# Labels are trimmed and lower-cased before mapping so that spelling variants such as
# 'Yes'/'yes' or 'No'/'no' cannot silently turn into NaN.
complete['Heart Issues'] = (
    complete['Heart Issues']
    .astype(str).str.strip().str.lower()
    .map({'no': 0, 'yes': 1})
)


In [None]:
complete['Heart Issues'].value_counts()

Any Transplants


No = 0
Yes = 1

In [None]:
complete['Any Transplants'].value_counts()

In [None]:
# Encode Any Transplants as 0 (no) / 1 (yes).
# Labels are trimmed and lower-cased before mapping so that spelling variants such as
# 'Yes'/'yes' or 'No'/'no' cannot silently turn into NaN.
complete['Any Transplants'] = (
    complete['Any Transplants']
    .astype(str).str.strip().str.lower()
    .map({'no': 0, 'yes': 1})
)


In [None]:
complete['Any Transplants'].value_counts()

In [None]:
complete.head(1)

Cancer History


No = 0
Yes = 1

In [None]:
complete['Cancer history'].value_counts()

In [None]:
# Encode Cancer history as 0 (no) / 1 (yes).
# Labels are trimmed and lower-cased before mapping so that spelling variants such as
# 'Yes'/'yes' or 'No'/'no' cannot silently turn into NaN.
complete['Cancer history'] = (
    complete['Cancer history']
    .astype(str).str.strip().str.lower()
    .map({'no': 0, 'yes': 1})
)


In [None]:
complete['Cancer history'].value_counts()

Smoker


No = 0
Yes = 1

In [None]:
complete['smoker'].value_counts()

In [None]:
# Encode smoker as 0 (no) / 1 (yes).
# Labels are trimmed and lower-cased before mapping so that spelling variants such as
# 'Yes'/'yes' or 'No'/'no' cannot silently turn into NaN.
complete['smoker'] = (
    complete['smoker']
    .astype(str).str.strip().str.lower()
    .map({'no': 0, 'yes': 1})
)


In [None]:
complete['smoker'].value_counts().sum()

In [None]:
complete.head(5)

In [None]:
complete['children'].value_counts()

EDA

In [None]:
complete['charges'].hist()
plt.title('Charge Total Frequency')

In [None]:
plt.figure(figsize=(10,10))
sns.swarmplot(data= complete, y = complete['charges'], hue = complete['gender'])
plt.title('Charge Points')

In [None]:
plt.figure(figsize=(8,8))
sns.boxplot(y =complete['charges'])

In [None]:
complete['Hospital tier'].value_counts().plot(kind = 'bar')

In [None]:
sns.histplot(data= complete, x = 'Hospital tier', hue= 'gender')

We see that Tier 2 hospitals have more female patients than Tier 1 and Tier 3 hospitals, but Tier 1 and Tier 3 hospitals have more male patients in general.

In [None]:
explode = (0, 0.1, .1)
complete.groupby('Hospital tier')['charges'].median().plot(kind ='pie', colormap= 'cool', legend= True, explode = explode, title= 'Median Charges per Hospital Tier')
plt.show();


We see that the median charge at Tier 1 hospitals was the largest in comparison to the other two hospital tiers.

In [None]:
complete.columns

In [None]:
sns.histplot( x=complete["Hospital tier"], hue= complete['City tier'], multiple= 'stack')

In [None]:
sns.barplot(data= complete,x=complete['age_gap'], y = complete['charges'], hue= complete['gender'])


# Statistics   

In [None]:
complete.groupby('Hospital tier')['charges'].mean()

Hospital Hypothesis testing

$H_0$ = The average hospitalization costs for the three types of hospitals are not significantly different

In [None]:
# One sample of charges per hospital tier, used as the groups of the ANOVA below.
hosp_1, hosp_2, hosp_3 = (
    complete.loc[complete['Hospital tier'] == tier, 'charges']
    for tier in (1, 2, 3)
)

In [None]:
import scipy.stats as stats

F, p = stats.f_oneway(hosp_1, hosp_2,hosp_3)

print("F-statistic:", F)
print("p-value:", p)


In [None]:
alpha = 0.05
if p < alpha:
    print("Reject null hypothesis: The average hospitalization costs are significantly different.")
else:
    print("Fail to reject null hypothesis: We can't say that the average hospitalization costs are significantly different.")

City Hypothesis Testing

$H_0$ = The average hospitalization costs for the three types of cities are not significantly different



In [None]:
# One sample of charges per city tier, used as the groups of the ANOVA below.
city_1, city_2, city_3 = (
    complete.loc[complete['City tier'] == tier, 'charges']
    for tier in (1, 2, 3)
)

In [None]:
F, p = stats.f_oneway(city_1, city_2,city_3)

print("F-statistic:", F)
print("p-value:", p)


In [None]:
alpha = 0.05
if p < alpha:
    print("Reject null hypothesis: The average hospitalization costs in the three cities are significantly different.")
else:
    print("Fail to reject null hypothesis: We can't say that the average hospitalization costs in the three cities are significantly different.")

Smoker Analysis

$H_0$ = The average hospitalization cost for smokers is not significantly different from the average cost for nonsmokers

Non-smoker = 0

In [None]:
# Charges of non-smokers (0) and smokers (1), compared in the test below.
smoker_0, smoker_1 = (
    complete.loc[complete['smoker'] == flag, 'charges']
    for flag in (0, 1)
)


In [None]:
F, p = stats.f_oneway(smoker_0, smoker_1)

print("F-statistic:", F)
print("p-value:", p)


In [None]:
alpha = 0.05
if p < alpha:
    print("Reject null hypothesis: The average hospitalization costs for smoker and non-smoker are significantly different.")
else:
    print("Fail to reject null hypothesis: We can't say that the average hospitalization costs for smoker and non-smoker are significantly different.")

Testing whether heart issues and smoking are independent

In [None]:
complete.columns

In [None]:
cont_tab = pd.crosstab(complete['smoker'],complete['Heart Issues'])
cont_tab

In [None]:
from scipy.stats import chi2_contingency

chi2_score,p,dof,ex_fre  = chi2_contingency(cont_tab)
p

In [None]:
alpha = 0.05
# Failing to reject H0 does not prove independence; it only means the data give no
# evidence of dependence. The message is worded like the other tests in this notebook.
if p < alpha:
    print("Reject null hypothesis: Smoking and heart issues are dependent.")
else:
    print("Fail to reject null hypothesis: We can't say that smoking and heart issues are dependent.")

# Machine Learning

In [None]:
complete.head(5)

In [None]:
# Correlate the numeric columns only: 'age_gap' is a categorical with text labels and
# cannot be converted to float, which makes DataFrame.corr() raise on recent pandas
# versions (older versions silently skipped such columns).
corr_mat = complete.select_dtypes(include='number').corr()
corr_mat

In [None]:
plt.figure(figsize=(12,10))
sns.heatmap(corr_mat, cmap='cool', annot= True)

In [None]:
complete.head()

In [None]:
sns.histplot(data=complete, x = "BMI")


In [None]:
sns.histplot(data = complete, x = "HBA1C")

In [None]:
sns.histplot(data= complete, x = 'year')

In [None]:
# 'age_gap' was only needed for the plots above; remove it before modelling.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe to
# re-run; `axis` is redundant when `columns=` is given.
complete = complete.drop(columns='age_gap', errors='ignore')


In [None]:
complete['BMI'].min(), complete['BMI'].max(),complete['HBA1C'].min(), complete['HBA1C'].max(), complete['charges'].min(), complete['charges'].max()

# Trial # 1

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the raw charges.
X = complete.drop(columns='charges')
Y = complete['charges']

# Hold out 20% for testing. The seed is fixed so that the split, and every metric
# reported from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=42)

In [None]:
from sklearn.linear_model import SGDRegressor

# Baseline linear model trained with stochastic gradient descent, default
# hyperparameters and a fixed seed.
# NOTE(review): the features are passed unscaled here; SGD-based estimators are
# generally sensitive to feature scale - consider standardising all features.
sgd_reg = SGDRegressor( random_state=42)

#train the model
sgd_reg.fit(X_train, y_train)



Test the model

In [None]:

y_pred = sgd_reg.predict(X_test)

Accuracy results

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")


It looks like our model is performing poorly; let's try hyperparameter tuning and cross-validate it as well.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Hyperparameter grid for the SGD regressor.
# NOTE(review): on recent scikit-learn releases the 'squared_loss' loss is spelled
# 'squared_error' and a disabled penalty is None rather than 'none' - confirm
# against the installed version before running this grid.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'loss': ['squared_loss', 'huber'],
    'penalty': ['none', 'l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [0.001, 0.01, 0.1]
}
grid = GridSearchCV(sgd_reg, param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Tune on the training split only. Fitting the search on the full (X, Y) lets the
# held-out rows influence both model selection and the refitted estimator, which
# makes the test-set error reported below optimistic.
grid.fit(X_train, y_train)
print("Best SGD Params: ", grid.best_params_)

estimator = grid.best_estimator_

# Without a `scoring` argument, cross_val_score reports the estimator's default
# score (R^2 for regressors), not RMSE.
cv_results = cross_val_score(estimator, X_train, y_train, cv=5)

print(f"Cross-validation scores: {cv_results}")
print(f"Mean Cross-validation score: {np.mean(cv_results)}")

# Evaluate once on the untouched test split.
y_pred = estimator.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# RMSE is the square root of the MSE, in the same units as the target.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")



***Results***
- Lets plot how accurate our model predicted the values

In [None]:
# Residuals on the held-out test split, the same convention the later trials use.
# Predicting on the full X would mix rows the model was trained on into the
# diagnostics and make the fit look better than it is.
y_pred = estimator.predict(X_test)
residuals = y_test - y_pred
residuals

In [None]:
import matplotlib.pyplot as plt

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


We want our residuals to be as close to 0 as possible. We see that once the predicted values exceed roughly 10k, our model starts to fail.

In [None]:
# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

***Results***

In [None]:
X_train.head()

# Trial #2

Our Results were not favorable, let's try standardizing some of the columns

In [None]:
from sklearn.preprocessing import StandardScaler

feat_to_scale = ['BMI']

prescaled_data = X[feat_to_scale]

# Standardise the selected features to zero mean and unit variance.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(prescaled_data)

# fit_transform returns a bare array, so rebuild a DataFrame with the original column
# names AND the original index. With a default RangeIndex, a label-aligned
# pd.concat(axis=1) with the rest of X would misalign rows (and introduce NaNs)
# whenever X's index is not exactly 0..n-1.
scaled_df = pd.DataFrame(scaled_data, columns=feat_to_scale, index=X.index)




In [None]:
# Swap the raw columns for their standardised versions (the scaled columns end up
# last). The stand-alone `X.drop(...)` statement was removed: its result was
# discarded, so it had no effect.
X_scaled = pd.concat([X.drop(feat_to_scale, axis=1), scaled_df], axis=1)

X_scaled.head()


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Split the frame with the standardised feature; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=.2, random_state=42)

sgd_reg = SGDRegressor( random_state=42)

#train the model
sgd_reg.fit(X_train, y_train)


# NOTE(review): on recent scikit-learn releases the 'squared_loss' loss is spelled
# 'squared_error' and a disabled penalty is None rather than 'none' - confirm
# against the installed version before running this grid.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'loss': ['squared_loss', 'huber'],
    'penalty': ['none', 'l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [0.001, 0.01, 0.1]
}
grid = GridSearchCV(sgd_reg, param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Fit the search on the *scaled training* split. Fitting on (X, Y) trained the
# estimator on the unscaled full data - test rows included - and then asked it to
# predict on scaled features in a different column order, so this trial never
# measured the effect of standardisation.
grid.fit(X_train, y_train)

estimator = grid.best_estimator_

# Without a `scoring` argument, cross_val_score reports the estimator's default
# score (R^2 for regressors), not RMSE.
cv_results = cross_val_score(estimator, X_train, y_train, cv=5)

print(f"Cross-validation scores: {cv_results}")
print(f"Mean Cross-validation score: {np.mean(cv_results)}")

y_pred = estimator.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# RMSE is the square root of the MSE, in the same units as the target.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")




***Results***
-  Standardizing BMI worsened my model; let's plot the residuals and see

In [None]:
residuals = y_test- y_pred


In [None]:
import matplotlib.pyplot as plt

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

# Trial # 3 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest tuned over forest size and tree depth with 5-fold CV,
# scored by (negated) RMSE.
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_rf.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out split.
y_pred = grid_rf.predict(X_test)
print("Best RF Params: ", grid_rf.best_params_)
print("RF Test RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))

residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

# Trial # 4

In [None]:
import xgboost as xgb

# Gradient-boosted trees tuned over learning rate, number of trees and depth with
# 5-fold CV, scored by (negated) RMSE.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
param_grid_xgb = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7]
}
grid_xgb = GridSearchCV(
    estimator=xg_reg,
    param_grid=param_grid_xgb,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_xgb.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out split.
y_pred = grid_xgb.predict(X_test)

print("Best XGB Params: ", grid_xgb.best_params_)
print("XGB Test RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))

residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
X_test.shape

In [None]:
# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

***Observations***
- The first run of my models, with my initial data clean-up and feature engineering, proved unsuccessful in all cases.
- Standardizing some of the features did not help either.
- The target variable has not been transformed yet; I will use a log scale to reduce the influence of outliers while keeping them in the data.

# Target log adjustment

In [None]:
# Log-transform the target: log1p(x) = log(1 + x). Predictions made on this scale
# must be mapped back with np.expm1, the exact inverse of np.log1p.
complete['log_charges'] = np.log1p(complete['charges'])



# SGDR Trial # 5 

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import SGDRegressor



# Both versions of the target are removed from the features so that neither can leak
# into the model; the log-scaled charges are the new target.
X = complete.drop(columns = ['charges', 'log_charges'])
Y = complete['log_charges']

# Seeded split; no test_size is given, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X,Y, random_state = 50)




In [None]:
sdg = SGDRegressor(random_state = 42)

In [None]:
# Hyperparameter grid for the SGD regressor on the log-scaled target.
# NOTE(review): on recent scikit-learn releases the 'squared_loss' loss is spelled
# 'squared_error' and a disabled penalty is None rather than 'none' - confirm
# against the installed version before running this grid.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'loss': ['squared_loss', 'huber'],
    'penalty': ['none', 'l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [0.001, 0.01, 0.1]}

grid = GridSearchCV(sdg, param_grid, cv= 5,  scoring='neg_root_mean_squared_error')

# The search sees the training split only, so X_test stays untouched for evaluation.
grid.fit(X_train, y_train)

estimator = grid.best_estimator_

# No `scoring` is passed, so these are the estimator's default scores (R^2 for
# regressors), not RMSE.
cv_results = cross_val_score(estimator, X_train, y_train, cv = 5)

In [None]:
print(f"Cross-validation scores: {cv_results}")
print(f"Mean Cross-validation score: {np.mean(cv_results)}")

y_pred = estimator.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# To get the Root Mean Squared Error (RMSE), you can take the square root of the MSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

In [None]:
# Test-set residuals on the log scale.
residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=.6)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

# Trial # 6 Forest Regressor, target adjustment

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the log-scaled target, tuned over forest size and tree depth with
# 5-fold CV, scored by (negated) RMSE.
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_rf.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out split.
y_pred = grid_rf.predict(X_test)
print("Best RF Params: ", grid_rf.best_params_)
print("RF Test RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))

residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=.6)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()



In [None]:
# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

# Trial # 7 XGB Target Adjustment

In [None]:
import xgboost as xgb

# Gradient-boosted trees on the log-scaled target, tuned over learning rate, number
# of trees and depth with 5-fold CV, scored by (negated) RMSE.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
param_grid_xgb = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7]
}
grid_xgb = GridSearchCV(
    estimator=xg_reg,
    param_grid=param_grid_xgb,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
grid_xgb.fit(X_train, y_train)

# Evaluate the refitted best model on the held-out split.
y_pred = grid_xgb.predict(X_test)

print("Best XGB Params: ", grid_xgb.best_params_)
print("XGB Test RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))

residuals = y_test - y_pred

# Residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()

# Distribution of the residuals; the red line marks zero error.
fig, ax = plt.subplots()
ax.hist(residuals, bins=30)
ax.set_xlabel('Residuals')
ax.set_ylabel('Frequency')
ax.axvline(x=0, color='r')
ax.set_title('Distribution of Residuals')
plt.show()

# **From Trials 5 to 7 you see much more promising predictions for our Model, changing the target value to a Log, greatly improved our results**

## ***Real Test Case Example:***

Estimate the cost of hospitalization for Christopher, Ms. Jayna (her date of birth is
12/28/1988, height is 170 cm, and weight is 85 kgs). She lives in a tier 1 city and her state’s
State ID is R1011. She lives with her partner and two children. She was found to be
nondiabetic (HbA1c = 5.8). She smokes but is otherwise healthy. She has had no
transplants or major surgeries. Her father died of lung cancer. Hospitalization costs will be
estimated using tier 1 hospitals.

- BMI needs to be calculated from height and weight: 85 kg / (1.70 m)² ≈ 29.41

In [None]:
# Feature values for the worked example, using the same encodings as the training
# frame. The keys are listed in the same order as the training feature columns.
data ={
    # Age, not birth year: 2023 - 1988 (the 'year' column was converted to age above).
    'year':35,
    'children':2,
    'Hospital tier': 1,
    'City tier': 1,
    # State R1011 is encoded as 2 by the State ID mapping.
    'State ID' : 2,
    # 85 kg / (1.70 m)^2
    'BMI': 29.41,
    'HBA1C': 5.8,
    'Heart Issues': 0,
    'Any Transplants': 0,
    # NOTE(review): set to 1 because of the father's lung cancer (family history) -
    # confirm the column is meant to capture family rather than personal history.
    'Cancer history': 1,
    'NumberOfMajorSurgeries':0,
    'smoker':1,
    # 'Ms.' maps to 1 (female) in the gender encoding.
    'gender':1
    
}

In [None]:
lady_Test = pd.DataFrame(data, index = [0])

In [None]:
lady_Test

### XGB Prediction

In [None]:
y_pred_XBG =  grid_xgb.predict(lady_Test)


In [None]:
# The target was built with np.log1p, so map the prediction back with np.expm1;
# np.exp would overstate the cost by exactly 1.
xgb_ = np.expm1(y_pred_XBG)
xgb_

### Random Forest Regressor Prediction

In [None]:
y_pred_RF = grid_rf.predict(lady_Test)
# The target was built with np.log1p, so map the prediction back with np.expm1;
# np.exp would overstate the cost by exactly 1.
rf_ = np.expm1(y_pred_RF)
rf_

### SGDR Prediction

In [None]:
y_pred_SGDR = estimator.predict(lady_Test)
# The target was built with np.log1p, so map the prediction back with np.expm1;
# np.exp would overstate the cost by exactly 1.
sgdr_ = np.expm1(y_pred_SGDR)
sgdr_

### Predicted Cost

In [None]:
predicted_value_total = (xgb_ + rf_ + sgdr_) / 3 
predicted_value_total                         


In [None]:
complete.to_csv('complete.csv', index = False)

In [None]:
complete