In [408]:
# library  imports
import numpy as np
import pandas as pd

In [409]:
# import the data
data = pd.read_csv('insurance-premium-predictor/insurance.csv')

# Exploratory Data Analysis

In [410]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [411]:
#Are there any null values?
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [412]:
# Verify data type
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [413]:
# What unique values do the object type columns have?
data.sex.unique(), data.sex.nunique()

(array(['female', 'male'], dtype=object), 2)

In [414]:
data.smoker.unique(), data.smoker.nunique()

(array(['yes', 'no'], dtype=object), 2)

In [415]:
data.region.unique(), data.region.nunique()

(array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object), 4)

### Label encode to change objects to numeric data for the models

In [416]:
from sklearn.preprocessing import LabelEncoder

In [417]:
sex_enc = LabelEncoder()
smoke_enc = LabelEncoder()
reg_enc = LabelEncoder()

In [418]:
data['sex'] = sex_enc.fit_transform(data['sex'])
data['smoker'] = smoke_enc.fit_transform(data['smoker'])
data['region'] = reg_enc.fit_transform(data['region'])

In [419]:
# Verify encoding
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


## Split the data

In [420]:
from sklearn.model_selection import train_test_split

In [421]:
# Leave 33 percent of data for testing
X_train, X_test, y_train, y_test = train_test_split(data[data.columns[:-2]], data[data.columns[-1]], test_size=0.33, random_state=10)

## Linear Regression

In [422]:
# Import model
from sklearn.linear_model import LinearRegression

In [423]:
# Creating an instance of Regression model to use in our code
reg_model = LinearRegression()

In [424]:
# Fit the model
reg_model.fit(X_train, y_train)

In [425]:
# test the model
print(f'Score on training set: {reg_model.score(X_train, y_train)}')
print(f'Coefficients: {reg_model.coef_}')
print(f'Intercept: {reg_model.intercept_}')

Score on training set: 0.7652955807027962
Coefficients: [  266.06611633  -299.84187398   333.53309941   643.33205661
 24363.0637539 ]
Intercept: -12851.75230417132


## Testing Linear Regression

In [426]:
from sklearn.metrics import r2_score

In [427]:
# Test the test set
predictions = reg_model.predict(X_test)

In [428]:
print(f"Score on test set: {r2_score(y_test,predictions)}")

Score on test set: 0.7082160188995364


## Random  Forest Implementation

In [429]:
from sklearn.ensemble import RandomForestRegressor

In [430]:
rf_model = RandomForestRegressor()

In [431]:
# fit the data
rf_model.fit(X_train,y_train)

In [432]:
print(f"Score on training set: {rf_model.score(X_train,y_train)}")

Score on training set: 0.9766431393918313


In [433]:
# Test the test set
rf_predictions = rf_model.predict(X_test)

In [434]:
print(f"Score on test set: {r2_score(y_test, rf_predictions)}")

Score on test set: 0.8099627128194429
