# Predicting a hospital charge using machine learning

In [4]:
# Import data analysis tools 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score


# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import pickle

# Assign random seed
np.random.seed(42)

In [5]:
# Import the training and validation set
df = pd.read_csv("insurance.csv")

# Checking for any missing data and datatypes
df.isna().sum(), df.dtypes

(age         0
 sex         0
 bmi         0
 children    0
 smoker      0
 region      0
 charges     0
 dtype: int64,
 age           int64
 sex          object
 bmi         float64
 children      int64
 smoker       object
 region       object
 charges     float64
 dtype: object)

In [6]:
# Encode categorical data to be binary
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'])
df.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [7]:
# Split data
X = df.drop("charges", axis=1)
y = df["charges"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit and score the model
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8622165326195984

In [8]:
# Take the mean of 10-fold cross-validation score
clf_cross_val_score = np.mean(cross_val_score(model, X, y, cv=10))

# Cross-validated score
print(f'The cross-validated score is: {clf_cross_val_score*100:.2f}%')

The cross-validated score is: 83.50%


In [9]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Make predictions using our regression model
y_preds = model.predict(X_test)

# Evaluate the regression model
print("Regression model metrics on the test set")
print(f"R^2: {r2_score(y_test, y_preds)}")
print(f"MAE: {mean_absolute_error(y_test, y_preds)}")
print(f"MSE: {mean_squared_error(y_test, y_preds)}")

Regression model metrics on the test set
R^2: 0.8622165326195984
MAE: 2555.684002643311
MSE: 21390708.531345494


In [10]:
# Save the model
pickle.dump(model, open("random_forst_model_1.pkl", "wb"))

In [11]:
# Testing the model with information for a 28 year old male in the northeast
lst = [[28, 30, 0, 0, 1, 1, 0, 1, 0, 0, 0]]
lst_df = pd.DataFrame(lst, columns=['age', 'bmi', 'children', 'sex_female', 'sex_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'])

In [12]:
lst_df

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,28,30,0,0,1,1,0,1,0,0,0


In [13]:
model.predict(lst_df)[0]

4261.926213275

In [24]:
# Creating an index dictionary for each column in the table to be inserted
cat = df.drop("charges", axis=1)
index_dict = dict(zip(cat.columns, range(cat.shape[1])))
index_dict

{'age': 0,
 'bmi': 1,
 'children': 2,
 'sex_female': 3,
 'sex_male': 4,
 'smoker_no': 5,
 'smoker_yes': 6,
 'region_northeast': 7,
 'region_northwest': 8,
 'region_southeast': 9,
 'region_southwest': 10}

In [21]:
# Exporting the index dictionary to be used with the flask webapp
with open('cat', 'wb') as fid:
    pickle.dump(index_dict, fid, 2)