## Problem Statement

The main goal of this project is to create an automated system to estimate the annual medical expenditure for new customers, using information such as their age, sex, BMI, children, smoking habits and region of residence. Estimates from your system will be used to determine the annual insurance premium (amount paid every month) offered to the customer.

We have the data in a CSV file. It contains each customer's individual information, such as age, sex, BMI, children, smoking habits and region of residence, along with the actual medical expenses incurred, for over 1300 customers.

Dataset source: Kaggle

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px

%matplotlib inline

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

## Data Gathering

In [None]:
df = pd.read_csv('insurance_premium_prediction.csv')

In [None]:
df

## Exploratory Data Analysis

In [None]:
df.head()

In [None]:
df.index

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
df.describe(include='O')

In [None]:
df.info()

In [None]:
df['age'].unique()

In [None]:
df['age'].nunique()

In [None]:
df['age'].value_counts()

In [None]:
# Histogram of customer ages with a box plot drawn in the margin.
# NOTE(review): nbins=47 presumably mirrors the number of distinct ages -
# confirm against df['age'].nunique().
fig = px.histogram(
    data_frame=df,
    x='age',
    nbins=47,
    marginal='box',
    title='Distribution of Age',
).update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
df['sex'].value_counts()

In [None]:
# Count of customers per value of the sex column.
fig = px.histogram(data_frame=df, x='sex', title='sex').update_layout(bargap=0.1)
fig.show()

In [None]:
df['bmi'].nunique()

In [None]:
# Histogram of bmi with a box plot drawn in the margin.
fig = px.histogram(
    data_frame=df,
    x='bmi',
    marginal='box',
    title='Distribution of bmi',
).update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
df.nunique()

In [None]:
# Count of customers per value of the children column.
fig = px.histogram(
    data_frame=df,
    x='children',
    title='distribution of children',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Count of customers per value of the smoker column.
fig = px.histogram(
    data_frame=df,
    x='smoker',
    title='distribution of smoker',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Count of customers per value of the region column.
fig = px.histogram(
    data_frame=df,
    x='region',
    title='distribution of region',
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Histogram of the expenses column with a box plot drawn in the margin.
fig = px.histogram(
    data_frame=df,
    x='expenses',
    marginal='box',
    title='distribution of expenses',
).update_layout(bargap=0.1)  # small gap between neighbouring bars
fig.show()

In [None]:
# Expenses histogram with one colour per value of sex, plus marginal box plots.
fig = px.histogram(
    data_frame=df,
    x='expenses',
    color='sex',
    color_discrete_sequence=['green', 'orange'],
    marginal='box',
    title="Annual Medical Expenses",
).update_layout(bargap=0.1)
fig.show()


In [None]:
# Expenses histogram with one colour per value of children, plus marginal
# box plots. Six colours are supplied for the children groups.
fig = px.histogram(
    data_frame=df,
    x='expenses',
    color='children',
    color_discrete_sequence=['green', 'orange','red','blue','purple','yellow'],
    marginal='box',
    title="Annual Medical Expenses",
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Expenses histogram with one colour per value of smoker, plus marginal
# box plots.
fig = px.histogram(
    data_frame=df,
    x='expenses',
    color='smoker',
    color_discrete_sequence=['green', 'orange'],
    marginal='box',
    title="Annual Medical Expenses",
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Expenses histogram with one colour per value of region, plus marginal
# box plots.
fig = px.histogram(
    data_frame=df,
    x='expenses',
    color='region',
    color_discrete_sequence=['green', 'orange','red', 'blue'],
    marginal='box',
    title="Annual Medical Expenses",
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Smoker counts, with each bar split by sex.
fig = px.histogram(
    data_frame=df,
    x='smoker',
    color='sex',
    title="Smoker",
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Region counts, with each bar split by smoking status.
fig = px.histogram(
    data_frame=df,
    x='region',
    color='smoker',
    title="region",
).update_layout(bargap=0.1)
fig.show()

In [None]:
# Scatter of expenses against age, coloured by smoking status.
fig = px.scatter(data_frame=df, x="age", y="expenses", color="smoker", title="Age vs Expenses")
fig.show()


In [None]:
# Scatter of expenses against bmi, coloured by smoking status.
fig = px.scatter(data_frame=df, x="bmi", y="expenses", color="smoker", title="BMI Vs Expenses")
fig.show()

In [None]:
# Pairwise correlations between the numeric columns only. sex, smoker and
# region still hold text at this point, and pandas >= 2.0 raises on
# non-numeric columns instead of silently leaving them out, so select the
# numeric columns explicitly (works on older pandas as well).
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
# sex, smoker and region still hold text at this point; pandas >= 2.0 raises
# if they are passed to corr(), so restrict to numeric columns first.
plt.figure(figsize=(10,7))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
plt.title('Correlation Matrix')

## Feature Engineering

In [None]:
df.isna().sum()

In [None]:
df[['expenses']].head(5)

In [None]:
df.head()

In [None]:
df['sex'].value_counts()

In [None]:
# Encode sex as 1 (male) / 0 (female).
# The result is assigned back to the column: calling
# replace(..., inplace=True) on df['sex'] mutates an intermediate object and
# leaves df unchanged once pandas Copy-on-Write is active.
# Any value other than 'male'/'female' becomes NaN, so run this cell once on
# the freshly loaded data.
df['sex'] = df['sex'].map({'male': 1, 'female': 0})

In [None]:
df['smoker'].value_counts()

In [None]:
# Encode smoker as 1 (yes) / 0 (no).
# The result is assigned back to the column: calling
# replace(..., inplace=True) on df['smoker'] mutates an intermediate object
# and leaves df unchanged once pandas Copy-on-Write is active.
# Any value other than 'yes'/'no' becomes NaN, so run this cell once on the
# freshly loaded data.
df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})

In [None]:
df.head()

In [None]:
df['region'].value_counts()

In [None]:
# Replace the region names with integer codes. The fitted encoder is kept in
# le so le.classes_ can map the codes back to region names.
# NOTE(review): the integer codes imply an ordering between regions that the
# linear model will treat as meaningful - confirm this is acceptable.
le = LabelEncoder()
df['region'] = le.fit(df['region']).transform(df['region'])

In [None]:
df.head()

In [None]:
le.classes_

In [None]:
# 2x2 grid: distribution (left) and box plot (right) for age, then for bmi.
plt.figure(figsize=(11,7))

plt.subplot(2,2,1)
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; kept here so the cell still runs on older installs.
sns.distplot(df['age'])

plt.subplot(2,2,2)
# Pass the column as x= explicitly. A bare positional Series is bound to `x`
# by older seaborn but to `data` by seaborn >= 0.12, which flips the box
# orientation depending on the installed version.
sns.boxplot(x=df['age'])

plt.subplot(2,2,3)
sns.distplot(df['bmi'])

plt.subplot(2,2,4)
sns.boxplot(x=df['bmi'])

#### As shown in the plots above, there are no outliers in the age column, but there are outliers in the bmi column; let's cap them at the IQR limits

In [None]:
df.describe()

In [None]:
# First and third quartiles of bmi, and the interquartile range between them.
q1, q3 = (df['bmi'].quantile(level) for level in (0.25, 0.75))
iqr = q3 - q1

In [None]:
q1

In [None]:
q3

In [None]:
iqr

In [None]:
# Outlier fences sit 1.5 * IQR beyond the quartiles; rows whose bmi falls
# strictly outside either fence are collected in `outliers`.
lower_limit, upper_limit = q1 - 1.5*iqr, q3 + 1.5*iqr
outliers = df.loc[df['bmi'].lt(lower_limit) | df['bmi'].gt(upper_limit)]

In [None]:
upper_limit

In [None]:
lower_limit

In [None]:
outliers

In [None]:
df.head()

In [None]:
# New BMI column: bmi with values beyond the outlier limits pulled back to the
# nearest limit; values already inside the limits are copied unchanged.
df['BMI'] = df['bmi'].clip(lower=lower_limit, upper=upper_limit)

In [None]:
df.head()

In [None]:
# Box plot of the capped BMI column.
plt.figure(figsize=(10,5))
# Pass the column as x= explicitly. A bare positional Series is bound to `x`
# by older seaborn but to `data` by seaborn >= 0.12, which flips the box
# orientation depending on the installed version.
sns.boxplot(x=df['BMI'])

#### As shown in the above boxplot, we do not have outliers in the new BMI column

In [None]:
df.corr()

In [None]:
df.drop(columns=['bmi'],inplace=True)

In [None]:
df.head()

In [None]:
df=df[['age','sex', 'BMI','children','smoker','region','expenses']]

In [None]:
df.head()

In [None]:
plt.figure(figsize=(11,6))
sns.heatmap(df.corr(),annot=True)

In [None]:
df.head()

## Splitting Data

In [None]:
# The expenses column is the target; every other column is a predictor.
y = df['expenses']
X = df.drop(columns=['expenses'])

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

## Linear Regression

In [None]:
# instantiating model
lr = LinearRegression()

In [None]:
# model training
lr_model = lr.fit(X_train, y_train)

In [None]:
# prediction
lr_y_pred = lr_model.predict(X_test)

In [None]:
lr_y_pred[:5]

In [None]:
y_test[:5]

In [None]:
# Error metrics for the linear regression predictions on the test split.
mae = mean_absolute_error(y_test, lr_y_pred)
mse = mean_squared_error(y_test, lr_y_pred)
r2score = r2_score(y_test, lr_y_pred)
rmse = np.sqrt(mse)  # same units as expenses

In [None]:
# Print the linear regression test metrics, one per line.
print(
    'linear regression testing accuracy :',
    f'mse: {mse}',
    f'mae: {mae}',
    f'rmse: {rmse}',
    f'r2 score: {r2score}',
    sep='\n',
)

## Decision Tree

In [None]:
# Decision tree: fit on the training split, then score on the test split.
# random_state is pinned (same seed as the train/test split) because tree
# construction is randomised; without it the reported metrics change from
# run to run and the model comparison is not reproducible.
dtree = DecisionTreeRegressor(random_state=42)
dtree_model = dtree.fit(X_train, y_train)  # fit() returns the estimator itself
dtree_y_pred = dtree_model.predict(X_test)

# Error metrics on the held-out test data.
mse = mean_squared_error(y_test, dtree_y_pred)
mae = mean_absolute_error(y_test, dtree_y_pred)
rmse = np.sqrt(mse)
r2score = r2_score(y_test, dtree_y_pred)
print('decision tree algorithm testing accuracy :')
print(f'mse: {mse}')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2 score: {r2score}')

## Random Forest

In [None]:
# Random forest: fit on the training split, then score on the test split.
# random_state is pinned (same seed as the train/test split) because the
# forest is built from random resampling; without it the reported metrics
# change from run to run and the model comparison is not reproducible.
rf = RandomForestRegressor(random_state=42)
rf_model = rf.fit(X_train, y_train)  # fit() returns the estimator itself
rf_y_pred = rf_model.predict(X_test)

# Error metrics on the held-out test data.
mse = mean_squared_error(y_test, rf_y_pred)
mae = mean_absolute_error(y_test, rf_y_pred)
rmse = np.sqrt(mse)
r2score = r2_score(y_test, rf_y_pred)
print('random forest algorithm testing accuracy :')
print(f'mse: {mse}')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2 score: {r2score}')

## Adaboost

In [None]:
# AdaBoost: fit on the training split, then score on the test split.
# random_state is pinned (same seed as the train/test split) because the
# boosting procedure is randomised; without it the reported metrics change
# from run to run and the model comparison is not reproducible.
ab = AdaBoostRegressor(random_state=42)
ab_model = ab.fit(X_train, y_train)  # fit() returns the estimator itself
ab_y_pred = ab_model.predict(X_test)

# Error metrics on the held-out test data.
mse = mean_squared_error(y_test, ab_y_pred)
mae = mean_absolute_error(y_test, ab_y_pred)
rmse = np.sqrt(mse)
r2score = r2_score(y_test, ab_y_pred)
print('Adaboost algorithm testing accuracy :')
print(f'mse: {mse}')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2 score: {r2score}')

## Gradient boosting

In [None]:
# Gradient boosting: fit on the training split, then score on the test split.
# random_state is pinned (same seed as the train/test split) so that this fit
# and its reported metrics are reproducible from run to run.
gb = GradientBoostingRegressor(random_state=42)
gb_model = gb.fit(X_train, y_train)  # fit() returns the estimator itself
gb_y_pred = gb_model.predict(X_test)

# Error metrics on the held-out test data.
mse = mean_squared_error(y_test, gb_y_pred)
mae = mean_absolute_error(y_test, gb_y_pred)
rmse = np.sqrt(mse)
r2score = r2_score(y_test, gb_y_pred)
print('gradient boosting algorithm testing accuracy :')
print(f'mse: {mse}')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2 score: {r2score}')

## XGB 

In [None]:
# XGBoost regressor with default settings: fit on the training split, then
# score on the test split.
xgb = XGBRegressor()
xgb_model = xgb.fit(X_train, y_train)
xgb_y_pred = xgb_model.predict(X_test)

# Error metrics on the held-out test data.
mae = mean_absolute_error(y_test, xgb_y_pred)
mse = mean_squared_error(y_test, xgb_y_pred)
r2score = r2_score(y_test, xgb_y_pred)
rmse = np.sqrt(mse)
print(
    'xgb regressor algorithm testing accuracy :',
    f'mse: {mse}',
    f'mae: {mae}',
    f'rmse: {rmse}',
    f'r2 score: {r2score}',
    sep='\n',
)

#### Gradient boosting gives the best accuracy. Let's check the testing and training accuracy using gradient boosting

In [None]:
# Refit gradient boosting as the chosen model and score it on the test split.
# gb_model from this cell is the object used by the later training-score,
# pickling and prediction cells.
# random_state is pinned (same seed as the train/test split) so the refit is
# reproducible and its metrics do not drift from run to run.
gb = GradientBoostingRegressor(random_state=42)
gb_model = gb.fit(X_train, y_train)  # fit() returns the estimator itself
gb_y_pred = gb_model.predict(X_test)

# Error metrics on the held-out test data.
mse = mean_squared_error(y_test, gb_y_pred)
mae = mean_absolute_error(y_test, gb_y_pred)
rmse = np.sqrt(mse)
r2score = r2_score(y_test, gb_y_pred)
print('gradient boosting algorithm testing accuracy :')
print(f'mse: {mse}')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2 score: {r2score}')

In [None]:
# Score the fitted gradient boosting model on the data it was trained on, for
# comparison with its test-split metrics.
gb_y_pred_train = gb_model.predict(X_train)

mae = mean_absolute_error(y_train, gb_y_pred_train)
mse = mean_squared_error(y_train, gb_y_pred_train)
r2score = r2_score(y_train, gb_y_pred_train)
rmse = np.sqrt(mse)
print(
    'gradient boosting algorithm training accuracy :',
    f'mse: {mse}',
    f'mae: {mae}',
    f'rmse: {rmse}',
    f'r2 score: {r2score}',
    sep='\n',
)

In [None]:
# Persist the fitted gradient boosting model to model.pkl.
import pickle

# Open the file in a context manager so the handle is flushed and closed as
# soon as the dump finishes. pickle.dump() returns None, so its result is not
# bound to a name.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(gb_model, model_file)

## user defined function

In [None]:
def prediction(user_data):
    """Predict expenses for one customer with the fitted ``gb_model``.

    ``user_data`` holds a single customer's feature values. It is wrapped in
    a list so the model receives a one-row batch, and the model's output for
    that batch is returned unchanged.

    NOTE(review): the values presumably have to be in the same order as the
    columns the model was trained on - confirm against the caller.
    """
    single_row_batch = [user_data]
    return gb_model.predict(single_row_batch)

# Try the helper on the second row of the test split and display the result.
user_data = X_test.iloc[1]
res = prediction(user_data)
res