In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv('insurance.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


We must convert this categorical data into nominal data, we will use LabelEncoder for that

In [5]:
le = LabelEncoder()
le.fit(df.sex.drop_duplicates())
list(le.classes_)

['female', 'male']

In [6]:
df.sex = le.transform(df.sex)
le.fit(df.smoker.drop_duplicates())
list(le.classes_)

['no', 'yes']

In [7]:
df.smoker = le.transform(df.smoker)
le.fit(df.region.drop_duplicates())
list(le.classes_)

['northeast', 'northwest', 'southeast', 'southwest']

In [8]:
df.region = le.transform(df.region)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


As we have now converted this categorical data into nominal data, let us go ahead and create our model in order to fit this data.

Let us prepare the data by dividing it into training and testing data, we divide the data into independent features, that is X. We won't include region as it has negligible impact on the prediction. Y will be the medical cost as that needs to be predicted.

In [13]:
X = df.drop(['charges','region'], axis = 1)
y = df.charges


In [14]:
X

Unnamed: 0,age,sex,bmi,children,smoker
0,19,0,27.900,0,1
1,18,1,33.770,1,0
2,28,1,33.000,3,0
3,33,1,22.705,0,0
4,32,1,28.880,0,0
...,...,...,...,...,...
1333,50,1,30.970,3,0
1334,18,0,31.920,0,0
1335,18,0,36.850,0,0
1336,21,0,25.800,0,0


In [15]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

We must split the data into training and testing, we do that by using the train_test_split method of the sklearn module:

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

((1003, 5), (1003,), (335, 5), (335,))

I've decided to use Random Forest as my regression model in order to predict the medical costs. Any model can be chosen but my intuition behind choosing Random Forest is because Decision Trees are easily implemented and are efficient, plus Random Forest overcomes their inaccuracy by pooling multiple decision trees to overcome the inaccuracy downfall, making Random Forests extremely strong regressors.

In [49]:
from sklearn.ensemble import RandomForestRegressor

We initialize the Random Forest. Here, we will choose the number of estimators, ie. the number of trees in this random forest. The criterion, or the loss function to evaluate the efficiency of this Random Forest is going to be the mean-squared-error. The random state, or the seed will be set to 1 and the number of jobs will be -1, which probably means using all processors as per the documentation. 

In [51]:
forest = RandomForestRegressor(n_estimators = 100,
                              criterion = 'mse',
                              random_state = 1,
                              n_jobs = -1)
forest.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

We make predictions using this Forest on the testing data:

In [53]:
y_predictions = forest.predict(X_test)

We import the necessary metrics that will be used to evaluate the performance of this model:

In [54]:
from sklearn.metrics import r2_score, explained_variance_score

In [55]:
y_predictions.shape, y_test.shape

((335,), (335,))

In [56]:
r2_score(y_test,y_predictions)

0.8172058608145433

In [57]:
explained_variance_score(y_test, y_predictions)

0.8174514266262684

We can see that we get a score of 0.81 which is pretty decent. I have implemented this using LinearRegression but the score achieved by RandomForest is significantly better.

If we deploy this model, let us try to create a custom input for the model so that it can predict the medical cost for us, let us start by getting one sample from the testing data and see the prediction and true value for this data is:

In [58]:
np.array(X_test)[0]

array([33.   ,  0.   , 26.695,  0.   ,  0.   ])

In [59]:
X_test.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker'], dtype='object')

As we can see, the sample can be described as follows:

Age = 33 years old

Sex = 0 (Female, 0 -> Female, 1 -> Male)

BMI = 26.695

No. of Children = 0

Smoker = 0 (Doesn't smoke, 0 -> Doesn't Smoke, 1 -> Smokes)

Let us predict the medical cost for this sample data:

In [62]:
sample = np.array(X_test)[0]
sample

array([33.   ,  0.   , 26.695,  0.   ,  0.   ])

In [63]:
sample.shape

(5,)

In [70]:
sample = sample.reshape(-1,5)

In [71]:
sample.shape

(1, 5)

In [72]:
sample

array([[33.   ,  0.   , 26.695,  0.   ,  0.   ]])

In [73]:
prediction = forest.predict(sample)

Let us compare the predicted value and the actual value:

In [77]:
print(f"Predicted Value:{np.round(prediction[0],3)}, Actual Value:{np.round(np.array(y_test)[0],3)}")

Predicted Value:4236.64, Actual Value:4571.413


We can see the that the model is not extremely accurate, but it is close enough to get a rough estimate about the medical cost that may incur.

Instead of a sample from the, let us give the model some custom data and feed it some random values to see the interesting prediction:

In [78]:
my_data = [21, 1, 26, 0, 0]

In [79]:
my_data = np.array(my_data)
my_data

array([21,  1, 26,  0,  0])

In [82]:
my_data = my_data.reshape(-1,5)
my_data

array([[21,  1, 26,  0,  0]])

In [85]:
prediction = forest.predict(my_data)
print(f'Estimated Medical Cost:{prediction[0]}')

Estimated Medical Cost:2044.238575


This infact, is pretty accurate in my case :)